Confluence Datasource

This module contains functionality related to the Confluence datasource.

Cleaner

ConfluenceCleaner

Bases: BaseCleaner

The ConfluenceCleaner class is a concrete implementation of BaseCleaner for cleaning Confluence documents.

Source code in src/embedding/datasources/confluence/cleaner.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
class ConfluenceCleaner(BaseCleaner):
    """
    Cleaner for Confluence documents that filters out documents with empty content.
    """

    def clean(
        self, documents: List[ConfluenceDocument]
    ) -> List[ConfluenceDocument]:
        """
        Keep only the documents whose content is not empty, preserving input order.

        Emptiness is decided by `_has_empty_content`, which is not defined in this class
        (presumably inherited from `BaseCleaner`).

        :param documents: List of ConfluenceDocument objects
        :return: List of ConfluenceDocument objects with non-empty content
        """
        with_progress = ConfluenceCleaner._get_documents_with_tqdm(documents)
        return [
            candidate
            for candidate in with_progress
            if not ConfluenceCleaner._has_empty_content(candidate)
        ]

    @staticmethod
    def _get_documents_with_tqdm(documents: List[ConfluenceDocument]):
        """
        Wrap the documents in a tqdm progress bar labelled for the Confluence cleaning step.

        :param documents: List of ConfluenceDocument objects
        :return: tqdm iterable over the given documents
        """
        progress_bar = tqdm(documents, desc="[Confluence] Cleaning documents")
        return progress_bar

_get_documents_with_tqdm(documents) staticmethod

Wrap the documents in a tqdm progress bar labelled "[Confluence] Cleaning documents" and return the wrapped iterable.

:param documents: List of ConfluenceDocument objects

Source code in src/embedding/datasources/confluence/cleaner.py
31
32
33
34
35
36
37
38
@staticmethod
def _get_documents_with_tqdm(documents: List[ConfluenceDocument]):
    """
    Wrap the documents in a tqdm progress bar labelled for the Confluence cleaning step.

    :param documents: List of ConfluenceDocument objects
    :return: tqdm iterable over the given documents
    """
    progress_bar = tqdm(documents, desc="[Confluence] Cleaning documents")
    return progress_bar

clean(documents)

Clean the list of Confluence documents. If the content is empty it is not added to the cleaned documents.

:param documents: List of ConfluenceDocument objects :return: List of cleaned ConfluenceDocument objects

Source code in src/embedding/datasources/confluence/cleaner.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def clean(
    self, documents: List[ConfluenceDocument]
) -> List[ConfluenceDocument]:
    """
    Keep only the documents whose content is not empty, preserving input order.

    The documents are iterated through a tqdm progress bar while being filtered.

    :param documents: List of ConfluenceDocument objects
    :return: List of ConfluenceDocument objects with non-empty content
    """
    with_progress = ConfluenceCleaner._get_documents_with_tqdm(documents)
    return [
        candidate
        for candidate in with_progress
        if not ConfluenceCleaner._has_empty_content(candidate)
    ]

Document

ConfluenceDocument

Bases: BaseDocument

Document representation for Confluence page content.

Extends BaseDocument to handle Confluence-specific document processing including content extraction, metadata handling, and exclusion configuration.

Attributes:
  • text

    Markdown-formatted page content

  • attachments

    Dictionary of page attachments (placeholder for future)

  • metadata

    Extracted page metadata including dates, IDs, and URLs

  • excluded_embed_metadata_keys

    Metadata keys to exclude from embeddings

  • excluded_llm_metadata_keys

    Metadata keys to exclude from LLM context

Note

Handles conversion of HTML content to markdown and manages metadata filtering for both embedding and LLM contexts.

Source code in src/embedding/datasources/confluence/document.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
class ConfluenceDocument(BaseDocument):
    """Document representation for Confluence page content.

    Extends BaseDocument to handle Confluence-specific document processing including
    content extraction, metadata handling, and exclusion configuration.

    Attributes:
        text: Markdown-formatted page content
        attachments: Dictionary of page attachments (placeholder for future)
        metadata: Extracted page metadata including dates, IDs, and URLs
        excluded_embed_metadata_keys: Metadata keys to exclude from embeddings
        excluded_llm_metadata_keys: Metadata keys to exclude from LLM context

    Note:
        Handles conversion of HTML content to markdown and manages metadata
        filtering for both embedding and LLM contexts.
    """

    @classmethod
    def from_page(cls, page: dict, base_url: str) -> "ConfluenceDocument":
        """Create ConfluenceDocument instance from page data.

        The rendered body (``page["body"]["view"]["value"]``) is converted
        with ``md`` and the metadata is derived via ``_get_metadata``; both
        exclusion lists are then computed from the resulting metadata keys.

        Args:
            page: Dictionary containing Confluence page details
            base_url: Base URL of the Confluence instance

        Returns:
            ConfluenceDocument: Configured document instance
        """
        document = cls(
            text=md(page["body"]["view"]["value"]),
            attachments={},  # TBD
            metadata=ConfluenceDocument._get_metadata(page, base_url),
        )
        document._set_excluded_embed_metadata_keys()
        document._set_excluded_llm_metadata_keys()
        return document

    def _set_excluded_embed_metadata_keys(self) -> None:
        """Configure metadata keys to exclude from embeddings.

        Every metadata key that is not listed in
        ``included_embed_metadata_keys`` is marked for exclusion.
        """
        self.excluded_embed_metadata_keys = [
            key
            for key in self.metadata
            if key not in self.included_embed_metadata_keys
        ]

    def _set_excluded_llm_metadata_keys(self) -> None:
        """Configure metadata keys to exclude from LLM context.

        Every metadata key that is not listed in
        ``included_llm_metadata_keys`` is marked for exclusion.
        """
        self.excluded_llm_metadata_keys = [
            key
            for key in self.metadata
            if key not in self.included_llm_metadata_keys
        ]

    @staticmethod
    def _get_metadata(page: dict, base_url: str) -> dict:
        """Extract and format page metadata.

        ``*_time`` keys carry the timestamp string exactly as found in the
        page history; ``*_date`` keys carry only the part before the ``"T"``
        separator.

        Args:
            page: Dictionary containing Confluence page details
            base_url: Base URL of the Confluence instance

        Returns:
            dict: Structured metadata including dates, IDs, and URLs
        """
        created_at = page["history"]["createdDate"]
        edited_at = page["history"]["lastUpdated"]["when"]
        return {
            "created_time": created_at,
            "created_date": created_at.split("T")[0],
            "datasource": "confluence",
            "format": "md",
            # Previously these two values were swapped: the "date" key held
            # the full timestamp and the "time" key held the date-only part,
            # inconsistent with created_time / created_date above.
            "last_edited_time": edited_at,
            "last_edited_date": edited_at.split("T")[0],
            "page_id": page["id"],
            # The space key is the last path segment of the expandable link.
            "space": page["_expandable"]["space"].split("/")[-1],
            "title": page["title"],
            "type": "page",
            "url": base_url + page["_links"]["webui"],
        }

_get_metadata(page, base_url) staticmethod

Extract and format page metadata.

Parameters:
  • page (dict) –

    Dictionary containing Confluence page details

  • base_url (str) –

    Base URL of the Confluence instance

Returns:
  • dict( dict ) –

    Structured metadata including dates, IDs, and URLs

Source code in src/embedding/datasources/confluence/document.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
@staticmethod
def _get_metadata(page: dict, base_url: str) -> dict:
    """Extract and format page metadata.

    ``*_time`` keys carry the timestamp string exactly as found in the page
    history; ``*_date`` keys carry only the part before the ``"T"``
    separator.

    Args:
        page: Dictionary containing Confluence page details
        base_url: Base URL of the Confluence instance

    Returns:
        dict: Structured metadata including dates, IDs, and URLs
    """
    created_at = page["history"]["createdDate"]
    edited_at = page["history"]["lastUpdated"]["when"]
    return {
        "created_time": created_at,
        "created_date": created_at.split("T")[0],
        "datasource": "confluence",
        "format": "md",
        # Previously these two values were swapped: the "date" key held the
        # full timestamp and the "time" key held the date-only part,
        # inconsistent with created_time / created_date above.
        "last_edited_time": edited_at,
        "last_edited_date": edited_at.split("T")[0],
        "page_id": page["id"],
        # The space key is the last path segment of the expandable link.
        "space": page["_expandable"]["space"].split("/")[-1],
        "title": page["title"],
        "type": "page",
        "url": base_url + page["_links"]["webui"],
    }

_set_excluded_embed_metadata_keys()

Configure metadata keys to exclude from embeddings.

Identifies metadata keys not explicitly included in embedding processing and marks them for exclusion.

Source code in src/embedding/datasources/confluence/document.py
44
45
46
47
48
49
50
51
52
53
54
55
def _set_excluded_embed_metadata_keys(self) -> None:
    """Compute the metadata keys that must be left out of embeddings.

    Walks the document's metadata keys in order and collects every key
    that is absent from ``included_embed_metadata_keys``.
    """
    excluded = []
    for name in self.metadata:
        if name in self.included_embed_metadata_keys:
            continue
        excluded.append(name)
    self.excluded_embed_metadata_keys = excluded

_set_excluded_llm_metadata_keys()

Configure metadata keys to exclude from LLM context.

Identifies metadata keys not explicitly included in LLM processing and marks them for exclusion.

Source code in src/embedding/datasources/confluence/document.py
57
58
59
60
61
62
63
64
65
66
67
68
def _set_excluded_llm_metadata_keys(self) -> None:
    """Compute the metadata keys that must be left out of the LLM context.

    Walks the document's metadata keys in order and collects every key
    that is absent from ``included_llm_metadata_keys``.
    """
    excluded = []
    for name in self.metadata:
        if name in self.included_llm_metadata_keys:
            continue
        excluded.append(name)
    self.excluded_llm_metadata_keys = excluded

from_page(page, base_url) classmethod

Create ConfluenceDocument instance from page data.

Parameters:
  • page (dict) –

    Dictionary containing Confluence page details

  • base_url (str) –

    Base URL of the Confluence instance

Returns:
  • ConfluenceDocument –

    Configured document instance

Source code in src/embedding/datasources/confluence/document.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
@classmethod
def from_page(cls, page: dict, base_url: str) -> "ConfluenceDocument":
    """Build a ConfluenceDocument from a Confluence page payload.

    The rendered body (``page["body"]["view"]["value"]``) is converted with
    ``md``, the metadata is derived via ``_get_metadata``, and both
    exclusion lists are computed before the instance is returned.

    Args:
        page: Dictionary containing Confluence page details
        base_url: Base URL of the Confluence instance

    Returns:
        ConfluenceDocument: Configured document instance
    """
    markdown_text = md(page["body"]["view"]["value"])
    page_metadata = ConfluenceDocument._get_metadata(page, base_url)
    instance = cls(
        text=markdown_text,
        attachments={},  # TBD
        metadata=page_metadata,
    )
    instance._set_excluded_embed_metadata_keys()
    instance._set_excluded_llm_metadata_keys()
    return instance

Manager

ConfluenceDatasourceManager

Bases: DatasourceManager

Manager for Confluence content extraction and processing.

Handles document retrieval, cleaning, splitting and embedding updates for Confluence workspace content. Implements the base DatasourceManager interface for Confluence-specific processing.

Source code in src/embedding/datasources/confluence/manager.py
 4
 5
 6
 7
 8
 9
10
11
12
class ConfluenceDatasourceManager(DatasourceManager):
    """Datasource manager for Confluence content.

    Adds nothing on top of ``DatasourceManager``: the class body is empty,
    so all behavior comes from the base class. It exists as a distinct,
    Confluence-specific type.
    """

Reader

ConfluenceReader

Bases: BaseReader

Reader for extracting documents from Confluence spaces.

Implements document extraction from Confluence spaces, handling pagination and export limits. Only asynchronous retrieval (get_all_documents_async) is implemented; the synchronous get_all_documents method is a placeholder.

Attributes:
  • export_limit

    Maximum number of documents to extract

  • confluence_client

    Client for Confluence API interactions

Source code in src/embedding/datasources/confluence/reader.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
class ConfluenceReader(BaseReader):
    """Reader for extracting documents from Confluence spaces.

    Implements document extraction from Confluence spaces, handling pagination
    and export limits. Only ``get_all_documents_async`` is implemented; the
    synchronous ``get_all_documents`` is a placeholder.

    Attributes:
        export_limit: Maximum number of documents to extract
        confluence_client: Client for Confluence API interactions
    """

    def __init__(
        self,
        configuration: ConfluenceDatasourceConfiguration,
        confluence_client: Confluence,
    ):
        """Initialize the Confluence reader.

        Args:
            configuration: Settings for Confluence access and limits
            confluence_client: Client for Confluence API interactions
        """
        super().__init__()
        self.export_limit = configuration.export_limit
        self.confluence_client = confluence_client

    def get_all_documents(self) -> List[ConfluenceDocument]:
        """Synchronously fetch all documents from Confluence.

        Returns:
            List[ConfluenceDocument]: List of extracted documents

        Note:
            Not implemented - use get_all_documents_async instead.
        """
        pass

    async def get_all_documents_async(self) -> List[ConfluenceDocument]:
        """Asynchronously fetch all documents from Confluence.

        Retrieves documents from all global spaces, respecting export limit.
        Spaces are read one after another; each space is only asked for the
        number of pages still missing to reach the export limit.

        Returns:
            List[ConfluenceDocument]: List of extracted and processed documents
        """
        logging.info(
            f"Fetching documents from Confluence with limit {self.export_limit}"
        )
        response = self.confluence_client.get_all_spaces(space_type="global")
        pages = []

        for space in response["results"]:
            # Remaining budget for this space (None means unlimited).
            space_limit = (
                self.export_limit - len(pages)
                if self.export_limit is not None
                else None
            )
            pages.extend(self._get_all_pages(space["key"], space_limit))
            if (
                self.export_limit is not None
                and len(pages) >= self.export_limit
            ):
                break

        pages = (
            pages if self.export_limit is None else pages[: self.export_limit]
        )
        documents = [
            ConfluenceDocument.from_page(page, self.confluence_client.url)
            for page in pages
        ]
        return documents

    def _get_all_pages(self, space: str, limit: int) -> List[dict]:
        """Fetch pages from a Confluence space, following pagination.

        Requests batches of pages, moving the ``start`` offset to the number
        of pages collected so far, until an empty batch is returned or
        ``limit`` pages have been collected. If an ``HTTPError`` is raised
        part-way, a warning is logged and the pages collected so far are
        returned.

        Args:
            space: Space key to fetch pages from
            limit: Maximum number of pages to fetch (None for unlimited)

        Returns:
            List[dict]: List of page details from the space
        """
        # A non-positive limit can never yield pages: skip the request
        # entirely (this also avoids a negative slice bound below).
        if limit is not None and limit <= 0:
            return []

        params = {
            "space": space,
            "start": 0,
            "status": None,
            "expand": "body.view,history.lastUpdated",
        }
        all_pages = []

        try:
            with tqdm(
                desc=f"[Confluence] Reading {space}'s pages content",
                unit="pages",
            ) as pbar:
                while True:
                    pages = self.confluence_client.get_all_pages_from_space(
                        **params
                    )
                    all_pages.extend(pages)
                    pbar.update(len(pages))

                    if not pages or ConfluenceReader._limit_reached(
                        all_pages, limit
                    ):
                        break

                    # Next batch starts right after what has been collected.
                    params["start"] = len(all_pages)
        except HTTPError as e:
            # The result may be truncated; make that visible at the default
            # log level instead of hiding it at debug.
            logging.warning(f"Error while fetching pages from {space}: {e}")

        return all_pages if limit is None else all_pages[:limit]

    @staticmethod
    def _limit_reached(pages: List[dict], limit: int) -> bool:
        """Check if page limit has been reached.

        Args:
            pages: List of retrieved pages
            limit: Maximum number of pages (None for unlimited)

        Returns:
            bool: True if limit reached, False otherwise
        """
        return limit is not None and len(pages) >= limit

__init__(configuration, confluence_client)

Initialize the Confluence reader.

Parameters:
  • configuration (ConfluenceDatasourceConfiguration) –

    Settings for Confluence access and limits

  • confluence_client (Confluence) –

    Client for Confluence API interactions

Source code in src/embedding/datasources/confluence/reader.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def __init__(
    self,
    configuration: ConfluenceDatasourceConfiguration,
    confluence_client: Confluence,
):
    """Set up the reader with its API client and export limit.

    Args:
        configuration: Settings for Confluence access and limits; its
            ``export_limit`` is copied onto the reader
        confluence_client: Client for Confluence API interactions
    """
    super().__init__()
    limit = configuration.export_limit
    self.export_limit = limit
    self.confluence_client = confluence_client

_get_all_pages(space, limit)

Fetch all pages from a Confluence space.

Parameters:
  • space (str) –

    Space key to fetch pages from

  • limit (int) –

    Maximum number of pages to fetch (None for unlimited)

Returns:
  • List[dict]

    List[dict]: List of page details from the space

Source code in src/embedding/datasources/confluence/reader.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def _get_all_pages(self, space: str, limit: int) -> List[dict]:
    """Fetch pages from a Confluence space, following pagination.

    Requests batches of pages, moving the ``start`` offset to the number of
    pages collected so far, until an empty batch is returned or ``limit``
    pages have been collected. If an ``HTTPError`` is raised part-way, a
    warning is logged and the pages collected so far are returned.

    Args:
        space: Space key to fetch pages from
        limit: Maximum number of pages to fetch (None for unlimited)

    Returns:
        List[dict]: List of page details from the space
    """
    # A non-positive limit can never yield pages: skip the request entirely
    # (this also avoids a negative slice bound below).
    if limit is not None and limit <= 0:
        return []

    params = {
        "space": space,
        "start": 0,
        "status": None,
        "expand": "body.view,history.lastUpdated",
    }
    all_pages = []

    try:
        with tqdm(
            desc=f"[Confluence] Reading {space}'s pages content",
            unit="pages",
        ) as pbar:
            while True:
                pages = self.confluence_client.get_all_pages_from_space(
                    **params
                )
                all_pages.extend(pages)
                pbar.update(len(pages))

                if not pages or ConfluenceReader._limit_reached(
                    all_pages, limit
                ):
                    break

                # Next batch starts right after what has been collected.
                params["start"] = len(all_pages)
    except HTTPError as e:
        # The result may be truncated; make that visible at the default log
        # level instead of hiding it at debug.
        logging.warning(f"Error while fetching pages from {space}: {e}")

    return all_pages if limit is None else all_pages[:limit]

_limit_reached(pages, limit) staticmethod

Check if page limit has been reached.

Parameters:
  • pages (List[dict]) –

    List of retrieved pages

  • limit (int) –

    Maximum number of pages (None for unlimited)

Returns:
  • bool( bool ) –

    True if limit reached, False otherwise

Source code in src/embedding/datasources/confluence/reader.py
131
132
133
134
135
136
137
138
139
140
141
142
@staticmethod
def _limit_reached(pages: List[dict], limit: int) -> bool:
    """Tell whether enough pages have been collected.

    Args:
        pages: List of retrieved pages
        limit: Maximum number of pages (None for unlimited)

    Returns:
        bool: False when ``limit`` is None; otherwise True once the number
        of pages is at least ``limit``
    """
    if limit is None:
        return False
    return len(pages) >= limit

get_all_documents()

Synchronously fetch all documents from Confluence.

Returns:
  • List[ConfluenceDocument]

    List[ConfluenceDocument]: List of extracted documents

Note

Not implemented - use get_all_documents_async instead.

Source code in src/embedding/datasources/confluence/reader.py
41
42
43
44
45
46
47
48
49
50
def get_all_documents(self) -> List[ConfluenceDocument]:
    """Placeholder for synchronous document retrieval.

    Returns:
        List[ConfluenceDocument]: Declared return type; the current body
        does no work and returns ``None``

    Note:
        Not implemented - use get_all_documents_async instead.
    """
    return None

get_all_documents_async() async

Asynchronously fetch all documents from Confluence.

Retrieves documents from all global spaces, respecting export limit.

Returns:
  • List[ConfluenceDocument]

    List[ConfluenceDocument]: List of extracted and processed documents

Source code in src/embedding/datasources/confluence/reader.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
async def get_all_documents_async(self) -> List[ConfluenceDocument]:
    """Collect pages from every global space and convert them to documents.

    Spaces are processed in the order returned by the client. Each space is
    asked only for the pages still missing to reach ``export_limit``, and
    iteration stops as soon as the limit is met.

    Returns:
        List[ConfluenceDocument]: List of extracted and processed documents
    """
    logging.info(
        f"Fetching documents from Confluence with limit {self.export_limit}"
    )
    spaces = self.confluence_client.get_all_spaces(space_type="global")
    collected = []

    for space in spaces["results"]:
        # Remaining budget for this space; None keeps it unlimited.
        if self.export_limit is not None:
            remaining = self.export_limit - len(collected)
        else:
            remaining = None
        collected.extend(self._get_all_pages(space["key"], remaining))

        if self.export_limit is None:
            continue
        if len(collected) >= self.export_limit:
            break

    if self.export_limit is not None:
        collected = collected[: self.export_limit]

    documents = []
    for page in collected:
        documents.append(
            ConfluenceDocument.from_page(page, self.confluence_client.url)
        )
    return documents

Splitter

ConfluenceSplitter

Bases: BaseSplitter

Source code in src/embedding/datasources/confluence/splitter.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
class ConfluenceSplitter(BaseSplitter):
    """
    Splitter for Confluence documents that delegates all work to a markdown splitter.
    """

    def __init__(
        self,
        markdown_splitter: BoundEmbeddingModelMarkdownSplitter,
    ):
        """
        Store the markdown splitter that `split` delegates to.

        :param markdown_splitter: MarkdownSplitter object for splitting documents
        """
        self.markdown_splitter = markdown_splitter

    def split(self, documents: List[ConfluenceDocument]) -> List[TextNode]:
        """
        Hand the documents to `markdown_splitter` and return the text nodes it produces.
        Documents should be in markdown format.

        :param documents: List of Document objects
        :return: List of TextNode objects
        """
        text_nodes = self.markdown_splitter.split(documents)
        return text_nodes

__init__(markdown_splitter)

The ConfluenceSplitter class is a concrete class that defines the interface for splitting documents into text nodes.

:param markdown_splitter: MarkdownSplitter object for splitting documents

Source code in src/embedding/datasources/confluence/splitter.py
14
15
16
17
18
19
20
21
22
23
def __init__(
    self,
    markdown_splitter: BoundEmbeddingModelMarkdownSplitter,
):
    """
    Create a `ConfluenceSplitter` that keeps the given markdown splitter for later use by `split`.

    :param markdown_splitter: MarkdownSplitter object for splitting documents
    """
    splitter = markdown_splitter
    self.markdown_splitter = splitter

split(documents)

Split the given list of documents into text nodes using markdown_splitter. Documents should be in markdown format.

:param documents: List of Document objects :return: List of TextNode objects

Source code in src/embedding/datasources/confluence/splitter.py
25
26
27
28
29
30
31
32
def split(self, documents: List[ConfluenceDocument]) -> List[TextNode]:
    """
    Hand the documents to `markdown_splitter` and return the text nodes it produces.
    Documents should be in markdown format.

    :param documents: List of Document objects
    :return: List of TextNode objects
    """
    text_nodes = self.markdown_splitter.split(documents)
    return text_nodes

Builders

ConfluenceCleanerBuilder

Builder for creating Confluence content cleaner instances.

Provides factory method to create Cleaner objects for Confluence content.

Source code in src/embedding/datasources/confluence/builders.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
class ConfluenceCleanerBuilder:
    """Factory for Confluence cleaners.

    Exposes a single static ``build`` method that returns a new
    ``ConfluenceCleaner``.
    """

    @staticmethod
    @inject
    def build() -> ConfluenceCleaner:
        """Instantiate a Confluence cleaner.

        Returns:
            ConfluenceCleaner: Newly created cleaner instance
        """
        cleaner = ConfluenceCleaner()
        return cleaner

build() staticmethod

Creates a content cleaner for Confluence.

Returns:
  • Cleaner( ConfluenceCleaner ) –

    Configured cleaner instance

Source code in src/embedding/datasources/confluence/builders.py
108
109
110
111
112
113
114
115
116
@staticmethod
@inject
def build() -> ConfluenceCleaner:
    """Instantiate a Confluence cleaner.

    Returns:
        ConfluenceCleaner: Newly created cleaner instance
    """
    cleaner = ConfluenceCleaner()
    return cleaner

ConfluenceClientBuilder

Builder for creating Confluence API client instances.

Provides factory method to create configured Confluence API clients.

Source code in src/embedding/datasources/confluence/builders.py
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
class ConfluenceClientBuilder:
    """Factory for Confluence API clients.

    Exposes a single static ``build`` method that creates a ``Confluence``
    client from the datasource configuration.
    """

    @staticmethod
    @inject
    def build(configuration: ConfluenceDatasourceConfiguration) -> Confluence:
        """Create a Confluence API client from the configuration.

        The base URL is taken from ``configuration.base_url``; username and
        password are unwrapped from ``configuration.secrets``.

        Args:
            configuration: Confluence connection settings

        Returns:
            Confluence: Configured API client instance
        """
        secrets = configuration.secrets
        client_kwargs = {
            "url": configuration.base_url,
            "username": secrets.username.get_secret_value(),
            "password": secrets.password.get_secret_value(),
        }
        return Confluence(**client_kwargs)

build(configuration) staticmethod

Creates a configured Confluence API client.

Parameters:
  • configuration (ConfluenceDatasourceConfiguration) –

    Confluence connection settings

Returns:
  • Confluence( Confluence ) –

    Configured API client instance

Source code in src/embedding/datasources/confluence/builders.py
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@staticmethod
@inject
def build(configuration: ConfluenceDatasourceConfiguration) -> Confluence:
    """Create a Confluence API client from the configuration.

    The base URL is taken from ``configuration.base_url``; username and
    password are unwrapped from ``configuration.secrets``.

    Args:
        configuration: Confluence connection settings

    Returns:
        Confluence: Configured API client instance
    """
    secrets = configuration.secrets
    client_kwargs = {
        "url": configuration.base_url,
        "username": secrets.username.get_secret_value(),
        "password": secrets.password.get_secret_value(),
    }
    return Confluence(**client_kwargs)

ConfluenceDatasourceManagerBuilder

Builder for creating Confluence datasource manager instances.

Provides factory method to create configured ConfluenceDatasourceManager with required components for content processing.

Source code in src/embedding/datasources/confluence/builders.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
class ConfluenceDatasourceManagerBuilder:
    """Factory for Confluence datasource managers.

    Exposes a single static ``build`` method that wires a configuration,
    reader, cleaner and splitter into a ``ConfluenceDatasourceManager``.
    """

    @staticmethod
    @inject
    def build(
        configuration: ConfluenceDatasourceConfiguration,
        reader: ConfluenceReader,
        cleaner: Cleaner,
        splitter: ConfluenceSplitter,
    ) -> ConfluenceDatasourceManager:
        """Assemble a Confluence datasource manager from its components.

        Args:
            configuration: Confluence access and processing settings
            reader: Component for reading Confluence content
            cleaner: Component for cleaning raw content
            splitter: Component for splitting content into chunks

        Returns:
            ConfluenceDatasourceManager: Configured manager instance
        """
        components = {
            "configuration": configuration,
            "reader": reader,
            "cleaner": cleaner,
            "splitter": splitter,
        }
        return ConfluenceDatasourceManager(**components)

build(configuration, reader, cleaner, splitter) staticmethod

Creates a configured Confluence datasource manager.

Parameters:
  • configuration (ConfluenceDatasourceConfiguration) –

    Confluence access and processing settings

  • reader (ConfluenceReader) –

    Component for reading Confluence content

  • cleaner (Cleaner) –

    Component for cleaning raw content

  • splitter (ConfluenceSplitter) –

    Component for splitting content into chunks

Returns:
  • ConfluenceDatasourceManager( ConfluenceDatasourceManager ) –

    Configured manager instance

Source code in src/embedding/datasources/confluence/builders.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
@staticmethod
@inject
def build(
    configuration: ConfluenceDatasourceConfiguration,
    reader: ConfluenceReader,
    cleaner: Cleaner,
    splitter: ConfluenceSplitter,
) -> ConfluenceDatasourceManager:
    """Assemble a Confluence datasource manager from its components.

    Args:
        configuration: Confluence access and processing settings
        reader: Component for reading Confluence content
        cleaner: Component for cleaning raw content
        splitter: Component for splitting content into chunks

    Returns:
        ConfluenceDatasourceManager: Configured manager instance
    """
    components = {
        "configuration": configuration,
        "reader": reader,
        "cleaner": cleaner,
        "splitter": splitter,
    }
    return ConfluenceDatasourceManager(**components)

ConfluenceReaderBuilder

Builder for creating Confluence reader instances.

Provides factory method to create configured ConfluenceReader objects.

Source code in src/embedding/datasources/confluence/builders.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
class ConfluenceReaderBuilder:
    """Factory for Confluence readers.

    Exposes a single static ``build`` method that creates a
    ``ConfluenceReader`` from a configuration and an API client.
    """

    @staticmethod
    @inject
    def build(
        configuration: ConfluenceDatasourceConfiguration,
        confluence_client: Confluence,
    ) -> ConfluenceReader:
        """Create a Confluence reader bound to the given client.

        Args:
            configuration: Confluence access settings
            confluence_client: Client for Confluence API interaction

        Returns:
            ConfluenceReader: Configured reader instance
        """
        reader = ConfluenceReader(
            configuration=configuration,
            confluence_client=confluence_client,
        )
        return reader

build(configuration, confluence_client) staticmethod

Creates a configured Confluence reader.

Parameters:
  • configuration (ConfluenceDatasourceConfiguration) –

    Confluence access settings

  • confluence_client (Confluence) –

    Client for Confluence API interaction

Returns:
  • ConfluenceReader( ConfluenceReader ) –

    Configured reader instance

Source code in src/embedding/datasources/confluence/builders.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
@staticmethod
@inject
def build(
    configuration: ConfluenceDatasourceConfiguration,
    confluence_client: Confluence,
) -> ConfluenceReader:
    """Create a Confluence reader bound to the given client.

    Args:
        configuration: Confluence access settings
        confluence_client: Client for Confluence API interaction

    Returns:
        ConfluenceReader: Configured reader instance
    """
    reader = ConfluenceReader(
        configuration=configuration,
        confluence_client=confluence_client,
    )
    return reader

ConfluenceSplitterBuilder

Source code in src/embedding/datasources/confluence/builders.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
class ConfluenceSplitterBuilder:
    """
    Factory exposing a static `build` method that creates `ConfluenceSplitter` instances.
    """

    @staticmethod
    @inject
    def build(
        markdown_splitter: BoundEmbeddingModelMarkdownSplitter,
    ) -> ConfluenceSplitter:
        """
        Create a `ConfluenceSplitter` wrapping the given markdown splitter.

        :param markdown_splitter: MarkdownSplitter object
        :return: ConfluenceSplitter object
        """
        splitter = ConfluenceSplitter(markdown_splitter)
        return splitter

build(markdown_splitter) staticmethod

Builds a ConfluenceSplitter instance using MarkdownSplitter.

:param markdown_splitter: MarkdownSplitter object :return: ConfluenceSplitter object

Source code in src/embedding/datasources/confluence/builders.py
121
122
123
124
125
126
127
128
129
130
131
132
@staticmethod
@inject
def build(
    markdown_splitter: BoundEmbeddingModelMarkdownSplitter,
) -> ConfluenceSplitter:
    """
    Create a `ConfluenceSplitter` wrapping the given markdown splitter.

    :param markdown_splitter: MarkdownSplitter object
    :return: ConfluenceSplitter object
    """
    splitter = ConfluenceSplitter(markdown_splitter)
    return splitter