diff --git a/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py
index 8694aa16..0c025062 100644
--- a/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py
+++ b/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py
@@ -1,5 +1,6 @@
 """Module for the DefaultConfluenceExtractor class."""
 
+import logging
 from langchain_community.document_loaders import ConfluenceLoader
 
 from extractor_api_lib.impl.types.extractor_types import ExtractorTypes
@@ -10,6 +11,8 @@
     ConfluenceLangchainDocument2InformationPiece,
 )
 
+logger = logging.getLogger(__name__)
+
 
 class ConfluenceExtractor(InformationExtractor):
     """Implementation of the InformationExtractor interface for confluence."""
@@ -54,6 +57,15 @@ async def aextract_content(
         confluence_loader_parameters = {
             x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs
         }
+        # Discard a missing, falsy (e.g. 0 or "") or non-numeric max_pages value.
+        # The key may be absent entirely, so pop() needs a default to avoid a KeyError.
+        if not confluence_loader_parameters.get("max_pages") or isinstance(
+            confluence_loader_parameters.get("max_pages"), str
+        ):
+            logger.warning(
+                "max_pages parameter is not set or invalid discarding it. ConfluenceLoader will use default value."
+            )
+            confluence_loader_parameters.pop("max_pages", None)
         # Drop the document_name parameter as it is not used by the ConfluenceLoader
         if "document_name" in confluence_loader_parameters:
             confluence_loader_parameters.pop("document_name", None)