Confluence Datasource

This module contains functionality related to the Confluence datasource.

Client

ConfluenceClientFactory

Bases: SingletonFactory

Factory for creating and managing Confluence client instances.

This factory ensures only one Confluence client is created per configuration, following the singleton pattern provided by the parent SingletonFactory class.

Source code in src/extraction/datasources/confluence/client.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
class ConfluenceClientFactory(SingletonFactory):
    """
    Factory that builds Confluence API clients from datasource configuration.

    The one-client-per-configuration behaviour is expected to come from the
    parent SingletonFactory (not shown here); this class only declares the
    accepted configuration type and how a client is constructed from it.
    """

    _configuration_class: Type = ConfluenceDatasourceConfiguration

    @classmethod
    def _create_instance(
        cls, configuration: ConfluenceDatasourceConfiguration
    ) -> Confluence:
        """
        Build a Confluence client for the instance described by `configuration`.

        Args:
            configuration: Supplies `base_url` plus the `username` and
                `password` secrets used to authenticate.

        Returns:
            A Confluence client pointed at `configuration.base_url`.
        """
        credentials = configuration.secrets
        # Secrets are stored wrapped; unwrap them only at the point of use.
        return Confluence(
            url=configuration.base_url,
            username=credentials.username.get_secret_value(),
            password=credentials.password.get_secret_value(),
        )

Configuration

ConfluenceDatasourceConfiguration

Bases: DatasourceConfiguration

Source code in src/extraction/datasources/confluence/configuration.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
class ConfluenceDatasourceConfiguration(DatasourceConfiguration):
    # Settings needed to reach a Confluence instance: where it lives
    # (`protocol` + `host`) and the credentials used to log in (`secrets`).

    class Secrets(BaseSecrets):
        # Credentials for the Confluence instance.
        # NOTE(review): presumably BaseSecrets loads these from environment
        # variables / an env file using the prefix below (e.g.
        # RAG__DATASOURCES__CONFLUENCE__USERNAME) — confirm against BaseSecrets.
        model_config = ConfigDict(
            env_file_encoding="utf-8",
            env_prefix="RAG__DATASOURCES__CONFLUENCE__",
            env_nested_delimiter="__",
            extra="ignore",
        )

        # Both fields are required (`...`) and wrapped in SecretStr; callers
        # unwrap them with `.get_secret_value()`.
        username: SecretStr = Field(
            ...,
            description="Username credential used to authenticate with the Confluence instance",
        )
        password: SecretStr = Field(
            ...,
            description="Password credential used to authenticate with the Confluence instance",
        )

    # Host part of the base URL; defaults to the local machine.
    host: str = Field(
        "127.0.0.1",
        description="Hostname or IP address of the Confluence server instance",
    )
    # Only "http" and "https" are accepted; defaults to plain "http".
    protocol: Union[Literal["http"], Literal["https"]] = Field(
        "http",
        description="Communication protocol used to connect to the Confluence server",
    )
    # Required discriminator pinned to the Confluence datasource name.
    name: Literal[DatasourceName.CONFLUENCE] = Field(
        ...,
        description="Identifier specifying this configuration is for a Confluence datasource",
    )
    # Defaults to None although annotated as `Secrets`.
    # NOTE(review): presumably populated after construction by the base
    # configuration machinery — confirm before relying on it being set.
    secrets: Secrets = Field(
        None,
        description="Authentication credentials required to access the Confluence instance",
    )

    @property
    def base_url(self) -> str:
        """
        Build the base URL of the Confluence instance as ``<protocol>://<host>``.

        No port, path, or trailing slash is appended.

        Returns:
            str: The base URL of the Confluence instance
        """
        return f"{self.protocol}://{self.host}"

base_url property

Constructs the complete base URL for the Confluence API from the protocol and host.

Returns:
  • str –

    The fully formed base URL to the Confluence instance

Document

ConfluenceDocument

Bases: BaseDocument

Document representation for Confluence page content.

Extends BaseDocument to handle Confluence-specific document processing including content extraction, metadata handling, and exclusion configuration.

Source code in src/extraction/datasources/confluence/document.py
 4
 5
 6
 7
 8
 9
10
11
class ConfluenceDocument(BaseDocument):
    """Document type produced for a Confluence page.

    A marker subclass: it declares no fields or methods of its own, so all
    behaviour is inherited from BaseDocument.
    """

Manager

ConfluenceDatasourceManagerFactory

Bases: Factory

Factory for creating Confluence datasource managers.

This factory generates managers that handle the extraction of content from Confluence instances. It ensures proper configuration, reading, and parsing of Confluence content.

Attributes:
  • _configuration_class (Type) –

    Configuration class used for validating and processing Confluence-specific settings.

Source code in src/extraction/datasources/confluence/manager.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
class ConfluenceDatasourceManagerFactory(Factory):
    """Factory that assembles the manager for the Confluence datasource.

    A manager is a BasicDatasourceManager wired with the Confluence reader
    and the Confluence parser, both built from the same configuration.

    Attributes:
        _configuration_class: Configuration type this factory accepts.
    """

    _configuration_class: Type = ConfluenceDatasourceConfiguration

    @classmethod
    def _create_instance(
        cls, configuration: ConfluenceDatasourceConfiguration
    ) -> BasicDatasourceManager:
        """Build a datasource manager for Confluence.

        Args:
            configuration: Confluence datasource configuration, passed as-is
                to the reader factory, the parser factory, and the manager.

        Returns:
            A BasicDatasourceManager holding the configuration, a Confluence
            reader, and a Confluence parser.
        """
        # Arguments are evaluated left to right, so the reader is still
        # created before the parser.
        return BasicDatasourceManager(
            configuration,
            ConfluenceDatasourceReaderFactory.create(configuration),
            ConfluenceDatasourceParserFactory.create(configuration),
        )

Parser

ConfluenceDatasourceParser

Bases: BaseParser[ConfluenceDocument]

Source code in src/extraction/datasources/confluence/parser.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class ConfluenceDatasourceParser(BaseParser[ConfluenceDocument]):
    """Turn raw Confluence page payloads into ConfluenceDocument objects.

    The page's rendered HTML body is converted to markdown with MarkItDown,
    and a flat metadata dictionary is derived from the page's history, id,
    title, space and web link.
    """

    def __init__(
        self,
        configuration: ConfluenceDatasourceConfiguration,
        parser: MarkItDown = MarkItDown(),
    ):
        """Initialize the Confluence parser with the provided configuration.

        Args:
            configuration: Configuration object containing Confluence connection
                details; its `base_url` is used to build page URLs.
            parser: MarkItDown converter used to turn page HTML into markdown.
                NOTE: the default is evaluated once, when the class is
                defined, so every parser created without an explicit `parser`
                shares the same MarkItDown instance.
        """
        self.configuration = configuration
        self.parser = parser

    def parse(self, page: dict) -> ConfluenceDocument:
        """Parse a Confluence page into a document.

        Args:
            page: Dictionary containing Confluence page information. It must
                provide `body.view.value`, `history.createdDate`,
                `history.lastUpdated.when`, `id`, `title`,
                `_expandable.space` and `_links.webui`.

        Returns:
            ConfluenceDocument: Parsed document with extracted text and metadata

        Raises:
            KeyError: If one of the required keys is missing from `page`.
        """
        markdown = self._get_page_markdown(page)
        metadata = self._extract_metadata(page, self.configuration.base_url)
        return ConfluenceDocument(text=markdown, metadata=metadata)

    def _get_page_markdown(self, page: dict) -> str:
        """Convert the rendered HTML body of a Confluence page to markdown.

        The HTML is written to a temporary `.html` file and the converter is
        given that file's path; the file is removed when the `with` block exits.

        Args:
            page: Dictionary containing Confluence page details

        Returns:
            str: Markdown content of the page, or "" when the page has no body
        """
        html_content = page["body"]["view"]["value"]
        if not html_content:
            return ""

        # Write explicitly as UTF-8: relying on the locale's default encoding
        # can raise UnicodeEncodeError for non-ASCII page content.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".html", encoding="utf-8"
        ) as temp_file:
            temp_file.write(html_content)
            # Flush so the converter reads the complete content from disk.
            temp_file.flush()
            return self.parser.convert(
                temp_file.name, file_extension=".html"
            ).text_content

    @staticmethod
    def _extract_metadata(page: dict, base_url: str) -> dict:
        """Extract and format page metadata.

        `*_time` keys carry the full timestamp string as returned by
        Confluence; `*_date` keys carry only the part before the "T".

        Args:
            page: Dictionary containing Confluence page details
            base_url: Base URL of the Confluence instance

        Returns:
            dict: Structured metadata including dates, IDs, and URLs
        """
        created = page["history"]["createdDate"]
        last_edited = page["history"]["lastUpdated"]["when"]
        return {
            "created_time": created,
            "created_date": created.split("T")[0],
            "datasource": "confluence",
            "format": "md",
            # Previously these two values were swapped relative to the
            # created_* pair (date held the timestamp and vice versa).
            "last_edited_date": last_edited.split("T")[0],
            "last_edited_time": last_edited,
            "page_id": page["id"],
            # `_expandable.space` is a path-like string; the space key is
            # taken from its last segment.
            "space": page["_expandable"]["space"].split("/")[-1],
            "title": page["title"],
            "type": "page",
            "url": base_url + page["_links"]["webui"],
        }

__init__(configuration, parser=MarkItDown())

Initialize the Confluence parser with the provided configuration.

Parameters:
  • configuration (ConfluenceDatasourceConfiguration) –

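    Configuration object containing Confluence connection details

  • parser (MarkItDown, default: MarkItDown() ) –

    MarkItDown converter used to turn page HTML into markdown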

Source code in src/extraction/datasources/confluence/parser.py
16
17
18
19
20
21
22
23
24
25
26
27
def __init__(
    self,
    configuration: ConfluenceDatasourceConfiguration,
    parser: MarkItDown = MarkItDown(),
):
    """Store the configuration and the HTML-to-markdown converter.

    Args:
        configuration: Configuration object containing Confluence connection details
        parser: MarkItDown converter instance kept on the parser. The default
            is created once, when the function is defined, and is therefore
            shared by all calls that omit this argument.
    """
    self.configuration, self.parser = configuration, parser

parse(page)

Parse a Confluence page into a document.

Parameters:
  • page (dict) –

    Dictionary containing Confluence page information

Returns:
  • ConfluenceDocument –

    Parsed document with extracted text and metadata

Source code in src/extraction/datasources/confluence/parser.py
29
30
31
32
33
34
35
36
37
38
39
40
def parse(self, page: dict) -> ConfluenceDocument:
    """Parse a Confluence page into a document.

    Args:
        page: Dictionary containing Confluence page information. It is passed
            to both the markdown conversion and the metadata extraction
            helpers, so it must be the page mapping, not a string.

    Returns:
        ConfluenceDocument: Parsed document with extracted text and metadata
    """
    # Keyword arguments are evaluated in order: markdown first, then metadata.
    return ConfluenceDocument(
        text=self._get_page_markdown(page),
        metadata=self._extract_metadata(page, self.configuration.base_url),
    )

ConfluenceDatasourceParserFactory

Bases: Factory

Source code in src/extraction/datasources/confluence/parser.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
class ConfluenceDatasourceParserFactory(Factory):
    """Factory producing ConfluenceDatasourceParser instances."""

    _configuration_class: Type = ConfluenceDatasourceConfiguration

    @classmethod
    def _create_instance(
        cls, configuration: ConfluenceDatasourceConfiguration
    ) -> ConfluenceDatasourceParser:
        """Build a Confluence parser bound to the given configuration.

        The parser's converter is left at its default.

        Args:
            configuration: Configuration object containing Confluence connection details

        Returns:
            ConfluenceDatasourceParser: Parser holding `configuration`
        """
        return ConfluenceDatasourceParser(configuration=configuration)

Reader

ConfluenceDatasourceReader

Bases: BaseReader

Reader for extracting documents from Confluence spaces.

Implements document extraction from Confluence spaces, handling pagination and export limits.

Source code in src/extraction/datasources/confluence/reader.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
class ConfluenceDatasourceReader(BaseReader):
    """Reads pages out of Confluence, space by space.

    Lists the global spaces, pages through each one, and yields the raw page
    dictionaries while honouring an optional overall export limit.
    """

    def __init__(
        self,
        configuration: ConfluenceDatasourceConfiguration,
        client: Confluence,
        logger: logging.Logger = LoggerConfiguration.get_logger(__name__),
    ):
        """Set up the reader.

        Args:
            configuration: Source of `export_limit`, the maximum number of
                pages to yield in total (None means no cap)
            client: Confluence API client used for all requests
            logger: Logger used for progress and warning messages
        """
        super().__init__()
        self.export_limit = configuration.export_limit
        self.client, self.logger = client, logger

    async def read_all_async(
        self,
    ) -> AsyncIterator[dict]:
        """Yield pages from every global Confluence space.

        Spaces are processed in the order the API returns them. Iteration
        stops as soon as `export_limit` pages have been yielded; with no
        limit, every fetched page is yielded.

        Returns:
            AsyncIterator[dict]: Page dictionaries as returned by the
            Confluence client (body view and last-update history expanded)
        """
        self.logger.info(
            f"Fetching pages from Confluence with limit {self.export_limit}"
        )
        spaces = self.client.get_all_spaces(space_type="global")["results"]
        emitted = 0

        for space in spaces:
            # Budget left for this space; None means unlimited.
            if self.export_limit is None:
                remaining = None
            else:
                remaining = self.export_limit - emitted
                if remaining <= 0:
                    break

            progress = tqdm(
                self._get_all_pages(space["key"], remaining),
                desc=f"[Confluence] Reading {space['key']} space pages content",
                unit="pages",
            )
            for page in progress:
                emitted += 1
                over_budget = (
                    self.export_limit is not None
                    and emitted > self.export_limit
                )
                if over_budget:
                    break
                yield page

    def _get_all_pages(self, space: str, limit: int) -> List[dict]:
        """Collect the pages of one space, following pagination.

        Requests are repeated with an increasing `start` offset until an
        empty batch comes back or `limit` pages have been collected. An
        HTTPError stops the loop with a warning; whatever was collected so far
        is still returned.

        Args:
            space: Space key to fetch pages from
            limit: Maximum number of pages to return (None for unlimited)

        Returns:
            List[dict]: Collected pages, truncated to `limit` when one is set
        """
        collected = []
        offset = 0

        try:
            while True:
                batch = self.client.get_all_pages_from_space(
                    space=space,
                    start=offset,
                    status=None,
                    expand="body.view,history.lastUpdated",
                )
                collected.extend(batch)

                if len(batch) == 0:
                    break
                if ConfluenceDatasourceReader._limit_reached(collected, limit):
                    break

                # Next request starts right after everything fetched so far.
                offset = len(collected)
        except HTTPError as e:
            self.logger.warning(f"Error while fetching pages from {space}: {e}")

        if limit is None:
            return collected
        # The last batch may overshoot the limit, so trim the surplus.
        return collected[:limit]

    @staticmethod
    def _limit_reached(pages: List[dict], limit: int) -> bool:
        """Tell whether enough pages have been collected.

        Args:
            pages: Pages collected so far
            limit: Maximum number of pages to retrieve (None for unlimited)

        Returns:
            bool: True when a limit is set and `len(pages)` has reached or
            passed it; False otherwise, including when `limit` is None
        """
        if limit is None:
            return False
        return len(pages) >= limit

__init__(configuration, client, logger=LoggerConfiguration.get_logger(__name__))

Initialize the Confluence reader.

Parameters:
  • configuration (ConfluenceDatasourceConfiguration) –

    Settings for Confluence access and export limits

  • client (Confluence) –

    Client for Confluence API interactions

  • logger (Logger, default: get_logger(__name__) ) –

    Logger instance for recording operation information

Source code in src/extraction/datasources/confluence/reader.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def __init__(
    self,
    configuration: ConfluenceDatasourceConfiguration,
    client: Confluence,
    logger: logging.Logger = LoggerConfiguration.get_logger(__name__),
):
    """Set up the Confluence reader.

    Args:
        configuration: Source of `export_limit`, which is copied onto the reader
        client: Confluence API client used for all requests
        logger: Logger used for progress and warning messages; the default is
            obtained once, when the function is defined
    """
    super().__init__()
    self.export_limit = configuration.export_limit
    self.client, self.logger = client, logger

read_all_async() async

Asynchronously fetch all documents from Confluence.

Retrieves pages from all global spaces in Confluence, respecting the export limit. Yields each page as a dictionary containing its content and metadata.

Returns:
  • AsyncIterator[dict] –

    An async iterator of page dictionaries containing page content and metadata such as body, title, and last update information

Source code in src/extraction/datasources/confluence/reader.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
async def read_all_async(
    self,
) -> AsyncIterator[dict]:
    """Yield pages from every global Confluence space.

    Spaces are processed in the order the API returns them. Iteration stops
    as soon as `export_limit` pages have been yielded; with no limit, every
    fetched page is yielded.

    Returns:
        AsyncIterator[dict]: Page dictionaries as produced by `_get_all_pages`
    """
    self.logger.info(
        f"Fetching pages from Confluence with limit {self.export_limit}"
    )
    spaces = self.client.get_all_spaces(space_type="global")["results"]
    emitted = 0

    for space in spaces:
        # Budget left for this space; None means unlimited.
        if self.export_limit is None:
            remaining = None
        else:
            remaining = self.export_limit - emitted
            if remaining <= 0:
                break

        progress = tqdm(
            self._get_all_pages(space["key"], remaining),
            desc=f"[Confluence] Reading {space['key']} space pages content",
            unit="pages",
        )
        for page in progress:
            emitted += 1
            over_budget = (
                self.export_limit is not None
                and emitted > self.export_limit
            )
            if over_budget:
                break
            yield page

ConfluenceDatasourceReaderFactory

Bases: Factory

Factory for creating Confluence reader instances.

Creates and configures ConfluenceDatasourceReader objects with appropriate clients based on the provided configuration.

Source code in src/extraction/datasources/confluence/reader.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
class ConfluenceDatasourceReaderFactory(Factory):
    """Factory producing ConfluenceDatasourceReader instances.

    Each reader is given a Confluence client obtained from
    ConfluenceClientFactory for the same configuration.
    """

    _configuration_class = ConfluenceDatasourceConfiguration

    @classmethod
    def _create_instance(
        cls, configuration: ConfluenceDatasourceConfiguration
    ) -> ConfluenceDatasourceReader:
        """Build a reader together with the client it will use.

        Args:
            configuration: Confluence connection and access settings

        Returns:
            ConfluenceDatasourceReader: Reader holding the configuration's
            export limit and a Confluence client; the logger is left at its
            default
        """
        return ConfluenceDatasourceReader(
            configuration=configuration,
            client=ConfluenceClientFactory.create(configuration),
        )