Confluence Datasource

This module contains functionality related to the Confluence datasource.

Client

ConfluenceClientFactory

Bases: SingletonFactory

Factory for creating and managing Confluence client instances.

This factory ensures only one Confluence client is created per configuration, following the singleton pattern provided by the parent SingletonFactory class.

Source code in src/extraction/datasources/confluence/client.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
class ConfluenceClientFactory(SingletonFactory):
    """
    Singleton factory that builds Confluence API clients.

    Instance reuse is left to the inherited SingletonFactory; this class only
    declares the configuration type and how a client is constructed from it.
    """

    _configuration_class: Type = ConfluenceDatasourceConfiguration

    @classmethod
    def _create_instance(
        cls, configuration: ConfluenceDatasourceConfiguration
    ) -> Confluence:
        """
        Build a Confluence client from the given datasource configuration.

        Args:
            configuration: Confluence datasource settings. Its base URL and the
                          username/password secrets are handed to the client.

        Returns:
            A Confluence client constructed with the configured URL and credentials.
        """
        base_url = configuration.base_url
        credentials = configuration.secrets
        # Secrets are stored wrapped; unwrap them only at the point of use.
        username = credentials.username.get_secret_value()
        password = credentials.password.get_secret_value()
        return Confluence(url=base_url, username=username, password=password)

Configuration

ConfluenceDatasourceConfiguration

Bases: DatasourceConfiguration

Source code in src/extraction/datasources/confluence/configuration.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
class ConfluenceDatasourceConfiguration(DatasourceConfiguration):
    """Configuration model for the Confluence datasource.

    Declares the connection settings (``host``, ``protocol``), a ``name`` field
    restricted to ``DatasourceName.CONFLUENCE``, the nested ``secrets`` holding
    the credentials, and the derived ``base_url`` property.
    """

    class Secrets(BaseSecrets):
        """Username/password credentials for the Confluence instance.

        The ``env_prefix`` and ``env_nested_delimiter`` settings below suggest the
        values come from ``RAG__DATASOURCES__CONFLUENCE__*`` environment variables;
        the loading itself is implemented by ``BaseSecrets`` (not visible here).
        """

        model_config = ConfigDict(
            env_file_encoding="utf-8",
            env_prefix="RAG__DATASOURCES__CONFLUENCE__",
            env_nested_delimiter="__",
            extra="ignore",
        )

        # Both credentials are required (no default) and kept as SecretStr.
        username: SecretStr = Field(
            ...,
            description="Username credential used to authenticate with the Confluence instance",
        )
        password: SecretStr = Field(
            ...,
            description="Password credential used to authenticate with the Confluence instance",
        )

    # Defaults to the loopback address.
    host: str = Field(
        "127.0.0.1",
        description="Hostname or IP address of the Confluence server instance",
    )
    # Only "http" and "https" are accepted; defaults to "http".
    protocol: Union[Literal["http"], Literal["https"]] = Field(
        "http",
        description="Communication protocol used to connect to the Confluence server",
    )
    # Required; must equal DatasourceName.CONFLUENCE.
    name: Literal[DatasourceName.CONFLUENCE] = Field(
        ...,
        description="Identifier specifying this configuration is for a Confluence datasource",
    )
    # NOTE(review): the default is None although the annotation is not Optional;
    # presumably the secrets are populated elsewhere before use - confirm.
    secrets: Secrets = Field(
        None,
        description="Authentication credentials required to access the Confluence instance",
    )

    @property
    def base_url(self) -> str:
        """
        Constructs the base URL of the Confluence instance as ``<protocol>://<host>``.

        No port or path component is added.

        Returns:
            str: The fully formed base URL to the Confluence instance
        """
        return f"{self.protocol}://{self.host}"

base_url property

Constructs the complete base URL for the Confluence API from the protocol and host.

Returns:
  • str –

    The fully formed base URL to the Confluence instance

Document

ConfluenceDocument

Bases: BaseDocument

Document representation for Confluence page content.

Extends BaseDocument to handle Confluence-specific document processing including content extraction, metadata handling, and exclusion configuration.

Source code in src/extraction/datasources/confluence/document.py
 4
 5
 6
 7
 8
 9
10
11
class ConfluenceDocument(BaseDocument):
    """Document type used for content extracted from Confluence pages.

    Defines no fields or methods of its own: all behavior is inherited from
    BaseDocument, and this subclass only gives Confluence documents a
    distinct type.
    """

Manager

ConfluenceDatasourceManagerFactory

Bases: Factory

Factory for creating Confluence datasource managers.

This factory generates managers that handle the extraction of content from Confluence instances. It ensures proper configuration, reading, and parsing of Confluence content.

Attributes:
  • _configuration_class (Type) –

    Configuration class used for validating and processing Confluence-specific settings.

Source code in src/extraction/datasources/confluence/manager.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
class ConfluenceDatasourceManagerFactory(Factory):
    """Factory that assembles datasource managers for Confluence.

    A manager built here pairs a Confluence reader with a Confluence parser,
    both created from the same configuration.

    Attributes:
        _configuration_class: Configuration type this factory is bound to.
    """

    _configuration_class: Type = ConfluenceDatasourceConfiguration

    @classmethod
    def _create_instance(
        cls, configuration: ConfluenceDatasourceConfiguration
    ) -> BasicDatasourceManager:
        """Build a datasource manager wired with a Confluence reader and parser.

        Args:
            configuration: Confluence datasource configuration, forwarded
                unchanged to the reader factory, the parser factory and the
                manager itself.

        Returns:
            A BasicDatasourceManager constructed from the configuration, the
            reader and the parser (in that order).
        """
        # The reader is created before the parser, as positional order matters
        # for the manager constructor.
        components = (
            ConfluenceDatasourceReaderFactory.create(configuration),
            ConfluenceDatasourceParserFactory.create(configuration),
        )
        return BasicDatasourceManager(configuration, *components)

Parser

ConfluenceDatasourceParser

Bases: BaseParser[ConfluenceDocument]

Source code in src/extraction/datasources/confluence/parser.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class ConfluenceDatasourceParser(BaseParser[ConfluenceDocument]):
    """Parser turning Confluence pages into ConfluenceDocument objects.

    The rendered HTML body of a page is converted to markdown with MarkItDown
    and combined with metadata taken from the page and the configured base URL.
    """

    def __init__(
        self,
        configuration: ConfluenceDatasourceConfiguration,
        parser: MarkItDown = MarkItDown(),
    ):
        """Initialize the Confluence parser with the provided configuration.

        Args:
            configuration: Configuration object containing Confluence connection details
            parser: MarkItDown instance for converting HTML to markdown. The default
                instance is created once, when the class body is evaluated, and is
                therefore shared by every parser that does not pass its own.
        """
        self.configuration = configuration
        self.parser = parser

    def parse(self, page: ConfluencePage) -> ConfluenceDocument:
        """Parse a Confluence page into a document.

        Args:
            page: Confluence page details

        Returns:
            ConfluenceDocument: Parsed document with extracted text and metadata
        """
        markdown = self._get_page_markdown(page)
        metadata = self._extract_metadata(page, self.configuration.base_url)
        return ConfluenceDocument(text=markdown, metadata=metadata)

    def _get_page_markdown(self, page: ConfluencePage) -> str:
        """Extract markdown content from a Confluence page.

        MarkItDown is invoked on a file path here, so the HTML is first written
        to a temporary ``.html`` file that is removed when the ``with`` block exits.

        Args:
            page: Confluence page details

        Returns:
            str: Markdown content of the page, or an empty string when the page
                has no HTML body
        """
        html_content = page.body.view.value
        if not html_content:
            return ""

        # Write as UTF-8 explicitly instead of relying on the platform's locale
        # encoding, which may be unable to encode characters found in the page.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".html", encoding="utf-8"
        ) as temp_file:
            temp_file.write(html_content)
            # Flush so the converter, which reads the file by name, sees the
            # complete content rather than an empty or partial file.
            temp_file.flush()
            return self.parser.convert(
                temp_file.name, file_extension=".html"
            ).text_content

    @staticmethod
    def _extract_metadata(page: ConfluencePage, base_url: str) -> dict:
        """Extract and format page metadata.

        The ``*_time`` keys hold the timestamp exactly as found on the page, while
        the ``*_date`` keys hold only the part before the ``"T"`` separator.

        Args:
            page: Confluence page details
            base_url: Base URL of the Confluence instance

        Returns:
            dict: Structured metadata including dates, IDs, and URLs
        """
        created = page.history.createdDate
        last_edited = page.history.lastUpdated.when
        return {
            "created_time": created,
            "created_date": created.split("T")[0],
            "datasource": "confluence",
            "format": "md",
            # Previously these two values were swapped relative to the
            # created_* pair (full timestamp under *_date, date under *_time).
            "last_edited_date": last_edited.split("T")[0],
            "last_edited_time": last_edited,
            "page_id": page.id,
            # The space key is the last path segment of the expandable space link.
            "space": page.expandable["space"].split("/")[-1],
            "title": page.title,
            "type": "page",
            "url": base_url + page.links.webui,
        }

__init__(configuration, parser=MarkItDown())

Initialize the Confluence parser with the provided configuration.

Parameters:
  • configuration (ConfluenceDatasourceConfiguration) –

    Configuration object containing Confluence connection details

  • parser (MarkItDown, default: MarkItDown() ) –

    MarkItDown instance for converting HTML to markdown

Source code in src/extraction/datasources/confluence/parser.py
17
18
19
20
21
22
23
24
25
26
27
28
29
def __init__(
    self,
    configuration: ConfluenceDatasourceConfiguration,
    parser: MarkItDown = MarkItDown(),
):
    """Store the collaborators the Confluence parser works with.

    Args:
        configuration: Confluence datasource configuration kept on the instance
        parser: MarkItDown converter used for turning HTML into markdown
    """
    # The two attributes are independent of each other; nothing else is set up here.
    self.parser = parser
    self.configuration = configuration

parse(page)

Parse a Confluence page into a document.

Parameters:
  • page (ConfluencePage) –

    Confluence page details

Returns:
  • ConfluenceDocument –

    Parsed document with extracted text and metadata

Source code in src/extraction/datasources/confluence/parser.py
31
32
33
34
35
36
37
38
39
40
41
42
def parse(self, page: ConfluencePage) -> ConfluenceDocument:
    """Convert one Confluence page into a ConfluenceDocument.

    Args:
        page: Confluence page to convert

    Returns:
        ConfluenceDocument: Document whose text is the page's markdown and whose
            metadata is built from the page and the configured base URL
    """
    # Markdown is produced first, then the metadata, then the document itself.
    document_text = self._get_page_markdown(page)
    return ConfluenceDocument(
        text=document_text,
        metadata=self._extract_metadata(page, self.configuration.base_url),
    )

ConfluenceDatasourceParserFactory

Bases: Factory

Source code in src/extraction/datasources/confluence/parser.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
class ConfluenceDatasourceParserFactory(Factory):
    """Factory producing ConfluenceDatasourceParser instances."""

    _configuration_class: Type = ConfluenceDatasourceConfiguration

    @classmethod
    def _create_instance(
        cls, configuration: ConfluenceDatasourceConfiguration
    ) -> ConfluenceDatasourceParser:
        """Build a Confluence parser for the given configuration.

        Args:
            configuration: Confluence datasource configuration handed to the parser

        Returns:
            ConfluenceDatasourceParser: Parser constructed with the configuration
                and the parser's default MarkItDown converter
        """
        parser_instance = ConfluenceDatasourceParser(configuration=configuration)
        return parser_instance

Reader

ConfluenceDatasourceReader

Bases: BaseReader

Reader for extracting documents from Confluence spaces.

Implements document extraction from Confluence spaces, handling pagination and export limits.

Source code in src/extraction/datasources/confluence/reader.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class ConfluenceDatasourceReader(BaseReader):
    """Reader for extracting documents from Confluence spaces.

    Implements document extraction from Confluence spaces, handling pagination
    and export limits.
    """

    def __init__(
        self,
        configuration: ConfluenceDatasourceConfiguration,
        client: Confluence,
        logger: logging.Logger = LoggerConfiguration.get_logger(__name__),
    ):
        """Initialize the Confluence reader.

        Args:
            configuration: Settings for Confluence access and export limits
            client: Client for Confluence API interactions
            logger: Logger instance for recording operation information
        """
        super().__init__()
        self.export_limit = configuration.export_limit
        self.client = client
        self.logger = logger

    async def read_all_async(
        self,
    ) -> AsyncIterator[ConfluencePage]:
        """Asynchronously fetch all documents from Confluence.

        Lists the global spaces, then walks the pages of each space in turn and
        yields them until the export limit is reached or the spaces are exhausted.

        Yields:
            ConfluencePage: Each fetched page with its content and metadata.
        """
        self.logger.info(
            f"Reading pages from Confluence with limit {self.export_limit}"
        )
        response = self.client.get_all_spaces(space_type="global")
        spaces = [Space.model_validate(space) for space in response["results"]]
        yield_counter = 0

        for space in spaces:
            for page in self._get_all_pages(space.key):
                # _limit_reached is not defined in this class; presumably it is
                # provided by BaseReader.
                if self._limit_reached(yield_counter, self.export_limit):
                    return

                # Count the page before logging so progress reads 1..N instead
                # of 0..N-1.
                yield_counter += 1
                self.logger.info(
                    f"Fetched Confluence page {yield_counter}/{self.export_limit}."
                )
                yield page

    def _get_all_pages(self, space: str) -> Iterator[ConfluencePage]:
        """Fetch all pages from a specific Confluence space.

        Requests pages batch by batch, advancing the ``start`` offset by the size
        of each batch, until an empty batch is returned. Pages are requested
        with their rendered body and last-update history expanded.

        An HTTPError raised while fetching is logged as a warning and ends the
        iteration for this space; pages yielded before the error are kept.

        Args:
            space: Space key to fetch pages from

        Yields:
            ConfluencePage: Validated pages of the space, in the order returned
                by the client
        """
        params = {
            "space": space,
            "start": 0,
            "status": None,
            "expand": "body.view,history.lastUpdated",
        }

        try:
            while True:
                pages_raw = self.client.get_all_pages_from_space(**params)
                pages = [
                    ConfluencePage.model_validate(page) for page in pages_raw
                ]
                if not pages:
                    return

                yield from pages

                # Next request starts right after the pages received so far.
                params["start"] += len(pages)
        except HTTPError as e:
            self.logger.warning(
                f"Error while fetching Confluence pages from {space}: {e}"
            )

__init__(configuration, client, logger=LoggerConfiguration.get_logger(__name__))

Initialize the Confluence reader.

Parameters:
  • configuration (ConfluenceDatasourceConfiguration) –

    Settings for Confluence access and export limits

  • client (Confluence) –

    Client for Confluence API interactions

  • logger (Logger, default: get_logger(__name__) ) –

    Logger instance for recording operation information

Source code in src/extraction/datasources/confluence/reader.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def __init__(
    self,
    configuration: ConfluenceDatasourceConfiguration,
    client: Confluence,
    logger: logging.Logger = LoggerConfiguration.get_logger(__name__),
):
    """Set up the reader with its client, logger and export limit.

    Args:
        configuration: Confluence datasource configuration; only its
            ``export_limit`` is read here
        client: Confluence client used for all API calls
        logger: Logger used for progress and warning messages
    """
    super().__init__()
    # The remaining assignments are independent of one another.
    self.logger = logger
    self.client = client
    self.export_limit = configuration.export_limit

read_all_async() async

Asynchronously fetch all documents from Confluence.

Retrieves pages from all global spaces in Confluence, respecting the export limit. Yields each page as a ConfluencePage containing its content and metadata.

Returns:
  • AsyncIterator[ConfluencePage] –

    An async iterator of Confluence pages.

Source code in src/extraction/datasources/confluence/reader.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
async def read_all_async(
    self,
) -> AsyncIterator[ConfluencePage]:
    """Asynchronously fetch all documents from Confluence.

    Lists the global spaces, then walks the pages of each space in turn and
    yields them until the export limit is reached or the spaces are exhausted.

    Yields:
        ConfluencePage: Each fetched page with its content and metadata.
    """
    self.logger.info(
        f"Reading pages from Confluence with limit {self.export_limit}"
    )
    response = self.client.get_all_spaces(space_type="global")
    spaces = [Space.model_validate(space) for space in response["results"]]
    yield_counter = 0

    for space in spaces:
        for page in self._get_all_pages(space.key):
            if self._limit_reached(yield_counter, self.export_limit):
                return

            # Count the page before logging so progress reads 1..N instead
            # of 0..N-1.
            yield_counter += 1
            self.logger.info(
                f"Fetched Confluence page {yield_counter}/{self.export_limit}."
            )
            yield page

ConfluenceDatasourceReaderFactory

Bases: Factory

Factory for creating Confluence reader instances.

Creates and configures ConfluenceDatasourceReader objects with appropriate clients based on the provided configuration.

Source code in src/extraction/datasources/confluence/reader.py
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
class ConfluenceDatasourceReaderFactory(Factory):
    """Factory producing ConfluenceDatasourceReader instances.

    Each reader is given a Confluence client obtained from
    ConfluenceClientFactory for the same configuration.
    """

    _configuration_class = ConfluenceDatasourceConfiguration

    @classmethod
    def _create_instance(
        cls, configuration: ConfluenceDatasourceConfiguration
    ) -> ConfluenceDatasourceReader:
        """Build a Confluence reader together with the client it uses.

        Args:
            configuration: Confluence datasource configuration, used both to
                obtain the client and to configure the reader

        Returns:
            ConfluenceDatasourceReader: Reader constructed with the
                configuration and the client
        """
        return ConfluenceDatasourceReader(
            client=ConfluenceClientFactory.create(configuration),
            configuration=configuration,
        )

ConfluencePage

Bases: BaseModel

Model representing a Confluence page from the API.

Source code in src/extraction/datasources/confluence/reader.py
42
43
44
45
46
47
48
49
50
class ConfluencePage(BaseModel):
    """Model representing a Confluence page from the API.

    Instances are built by validating the raw page payloads returned by the
    Confluence client.
    """

    # Page identifier and title as given in the payload.
    id: str
    title: str
    # Nested models for the page body and its history (declared elsewhere in
    # the module).
    body: Body
    history: History
    # Populated from the payload's "_links" key via the alias.
    links: Links = Field(alias="_links")
    # Populated from the payload's "_expandable" key via the alias; a
    # string-to-string mapping.
    expandable: Dict[str, str] = Field(alias="_expandable")