Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@


class FileUploader(UploaderBase):
"""File uploader implementation."""

@abstractmethod
async def upload_file(
Expand All @@ -16,7 +17,7 @@ async def upload_file(
file: UploadFile,
) -> None:
"""
Uploads a source file for content extraction.
Upload a source file for content extraction.

Parameters
----------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ async def upload_source(
timeout: Optional[float],
) -> None:
"""
Uploads the parameters for source content extraction.
Upload the parameters for source content extraction.

Parameters
----------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@ class UploaderBase:
"""Base class for uploader API endpoints."""

def __init__(self):
"""
Initialize the UploaderBase.
"""
"""Initialize the UploaderBase."""
self._background_threads = []

def _prune_background_threads(self) -> list[Thread]:
Expand Down
4 changes: 2 additions & 2 deletions libs/admin-api-lib/src/admin_api_lib/apis/admin_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ async def upload_file(
request: Request,
) -> None:
"""
Uploads user selected sources.
Upload user selected sources.

Parameters
----------
Expand Down Expand Up @@ -181,7 +181,7 @@ async def upload_source(
key_value_pair: List[KeyValuePair] = Body(None, description="The key-value pairs for the source"),
) -> None:
"""
Uploads user selected sources.
Upload user selected sources.

Parameters
----------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import io
import logging
import traceback

from fastapi import HTTPException, Response, status

Expand Down Expand Up @@ -54,10 +53,8 @@ async def adocument_reference_id_get(self, identification: str) -> Response:
self._file_service.download_file(identification, document_buffer)
logger.debug("DONE retrieving document with id: %s", identification)
document_data = document_buffer.getvalue()
except Exception as e:
logger.error(
"Error retrieving document with id: %s. Error: %s %s", identification, e, traceback.format_exc()
)
except Exception:
logger.exception("Error retrieving document with id: %s", identification)
raise ValueError(f"Document with id '{identification}' not found.")
finally:
document_buffer.close()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Module for the default file uploader implementation."""

import logging
from pathlib import Path
import traceback
import urllib
import tempfile
import asyncio
Expand Down Expand Up @@ -78,7 +79,7 @@ async def upload_file(
file: UploadFile,
) -> None:
"""
Uploads a source file for content extraction.
Upload a source file for content extraction.

Parameters
----------
Expand Down Expand Up @@ -109,7 +110,7 @@ async def upload_file(
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
except Exception as e:
self._key_value_store.upsert(source_name, Status.ERROR)
logger.error("Error while uploading %s = %s", source_name, str(e))
logger.exception("Error while uploading %s", source_name)
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))

def _log_task_exception(self, task: asyncio.Task) -> None:
Expand All @@ -124,19 +125,16 @@ def _log_task_exception(self, task: asyncio.Task) -> None:
if task.done() and not task.cancelled():
try:
task.result() # This will raise the exception if one occurred
except Exception as e:
logger.error("Background task failed with exception: %s", str(e))
logger.debug("Background task exception traceback: %s", traceback.format_exc())
except Exception:
logger.exception("Background task failed with exception")

def _prune_background_tasks(self) -> None:
"""
Remove completed background tasks from the list.
"""
"""Remove completed background tasks from the list."""
self._background_tasks = [task for task in self._background_tasks if not task.done()]

def _check_if_already_in_processing(self, source_name: str) -> None:
"""
Checks if the source is already in processing state.
Check if the source is already in processing state.

Parameters
----------
Expand Down Expand Up @@ -196,9 +194,9 @@ async def _handle_source_upload(
await asyncio.to_thread(self._rag_api.upload_information_piece, rag_information_pieces)
self._key_value_store.upsert(source_name, Status.READY)
logger.info("Source uploaded successfully: %s", source_name)
except Exception as e:
except Exception:
self._key_value_store.upsert(source_name, Status.ERROR)
logger.error("Error while uploading %s = %s", source_name, str(e))
logger.exception("Error while uploading %s", source_name)

def _add_file_url(self, file_name: str, base_url: str, chunked_documents: list[Document]):
document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file_name)}"
Expand Down Expand Up @@ -229,6 +227,6 @@ async def _asave_new_document(

self._file_service.upload_file(Path(temp_file_path), filename)
return filename
except Exception as e:
logger.error("Error during document saving: %s %s", e, traceback.format_exc())
except Exception:
logger.exception("Error during document saving")
self._key_value_store.upsert(source_name, Status.ERROR)
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Module for the default source uploader implementation."""

import logging
import asyncio
from threading import Thread
Expand Down Expand Up @@ -28,6 +30,7 @@


class DefaultSourceUploader(SourceUploader):
"""Default implementation of the SourceUploader."""

def __init__(
self,
Expand Down Expand Up @@ -78,7 +81,7 @@ async def upload_source(
kwargs: list[KeyValuePair],
) -> None:
"""
Uploads the parameters for source content extraction.
Upload the parameters for source content extraction.

Parameters
----------
Expand All @@ -95,7 +98,6 @@ async def upload_source(
-------
None
"""

self._prune_background_threads()

source_name = f"{source_type}:{sanitize_document_name(name)}"
Expand All @@ -111,12 +113,12 @@ async def upload_source(
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
except Exception as e:
self._key_value_store.upsert(source_name, Status.ERROR)
logger.error("Error while uploading %s = %s", source_name, str(e))
logger.exception("Error while uploading %s", source_name)
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))

def _check_if_already_in_processing(self, source_name: str) -> None:
"""
Checks if the source is already in processing state.
Check if the source is already in processing state.

Parameters
----------
Expand Down Expand Up @@ -197,6 +199,6 @@ async def _handle_source_upload(
await asyncio.to_thread(self._rag_api.upload_information_piece, rag_information_pieces)
self._key_value_store.upsert(source_name, Status.READY)
logger.info("Source uploaded successfully: %s", source_name)
except Exception as e:
except Exception:
self._key_value_store.upsert(source_name, Status.ERROR)
logger.error("Error while uploading %s = %s", source_name, str(e))
logger.exception("Error while uploading %s", source_name)
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Class to handle I/O with S3 storage."""

import logging
import traceback
from pathlib import Path
from typing import BinaryIO

Expand Down Expand Up @@ -125,7 +124,7 @@ def delete_file(self, file_name: str) -> None:
try:
file_name = f"/{file_name}" if not file_name.startswith("/") else file_name
self._s3_client.delete_object(Bucket=self._s3_settings.bucket, Key=file_name)
logger.info(f"File {file_name} successfully deleted.")
except Exception as e:
logger.error("Error deleting file %s: %s %s" % (file_name, e, traceback.format_exc()))
logger.info("File %s successfully deleted.", file_name)
except Exception:
logger.exception("Error deleting file %s", file_name)
raise
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Module for the LangchainSummarizer class."""

import logging
import traceback
from typing import Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
Expand Down Expand Up @@ -66,7 +65,7 @@ async def ainvoke(self, query: SummarizerInput, config: Optional[RunnableConfig]
assert query, "Query is empty: %s" % query # noqa S101
config = ensure_config(config)
tries_remaining = config.get("configurable", {}).get("tries_remaining", 3)
logger.debug("Tries remaining %d" % tries_remaining)
logger.debug("Tries remaining %d", tries_remaining)

if tries_remaining < 0:
raise Exception("Summary creation failed.")
Expand All @@ -81,8 +80,8 @@ async def ainvoke(self, query: SummarizerInput, config: Optional[RunnableConfig]
# Extract content from AIMessage if it's not already a string
content = result.content if hasattr(result, "content") else str(result)
outputs.append(content)
except Exception as e:
logger.error("Error in summarizing langchain doc: %s %s", e, traceback.format_exc())
except Exception:
logger.exception("Error in summarizing langchain doc")
config["tries_remaining"] = tries_remaining - 1
result = await self._create_chain().ainvoke({"text": langchain_document.page_content}, config)
# Extract content from AIMessage if it's not already a string
Expand All @@ -93,8 +92,9 @@ async def ainvoke(self, query: SummarizerInput, config: Optional[RunnableConfig]
return outputs[0]
summary = " ".join(outputs)
logger.debug(
"Reduced number of chars from %d to %d"
% (len("".join([x.page_content for x in langchain_documents])), len(summary))
"Reduced number of chars from %d to %d",
len("".join([x.page_content for x in langchain_documents])),
len(summary),
)
return await self.ainvoke(summary, config)

Expand Down
4 changes: 2 additions & 2 deletions libs/extractor-api-lib/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ per-file-ignores = """
./src/extractor_api_lib/apis/extractor_api.py: B008,WOT001,
./src/extractor_api_lib/impl/extractor_api_impl.py: B008,
./src/extractor_api_lib/container.py: CCE002,CCE001,
./src/extractor_api_lib/apis/extractor_api_base.py: WOT001,
./tests/*: S101,E501,
./src/extractor_api_lib/apis/extractor_api_base.py: WOT001,D105,
./tests/*: S101,E501,D105,D100,D102,
"""

[tool.black]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Module for the FileExtractor class."""

from abc import ABC, abstractmethod
from extractor_api_lib.models.extraction_request import ExtractionRequest
from extractor_api_lib.models.information_piece import InformationPiece
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Module for the SourceExtractor class."""

from abc import ABC, abstractmethod

from extractor_api_lib.models.extraction_parameters import ExtractionParameters
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,14 @@ class InformationExtractor(ABC):

@property
@abstractmethod
def extractor_type(self) -> ExtractorTypes: ...
def extractor_type(self) -> ExtractorTypes:
"""Return the type of the extractor.

Returns
-------
ExtractorTypes
The type of the extractor.
"""

@abstractmethod
async def aextract_content(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import logging
from pathlib import Path
import tempfile
import traceback


from extractor_api_lib.api_endpoints.file_extractor import FileExtractor
Expand Down Expand Up @@ -76,5 +75,5 @@ async def aextract_information(self, extraction_request: ExtractionRequest) -> l
)
return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None]
except Exception as e:
logger.error("Error during document parsing: %s %s", e, traceback.format_exc())
logger.exception("Error during document parsing")
raise e
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ def __init__(

@property
def extractor_type(self) -> ExtractorTypes:
"""Return the type of the extractor.

Returns
-------
ExtractorTypes
The type of the extractor.
"""
return ExtractorTypes.CONFLUENCE

async def aextract_content(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ async def aextract_content(self, file_path: Path, name: str) -> list[InternalInf
)
pdf_elements += new_pdf_elements

logger.info(f"Extraction completed. Found {len(pdf_elements)} information pieces.")
logger.info("Extraction completed. Found %d information pieces.", len(pdf_elements))
return pdf_elements

def _is_text_based(self, page: Page) -> bool:
Expand Down Expand Up @@ -200,8 +200,8 @@ def _extract_tables_from_text_page(
table_df = pd.DataFrame(table_data)
try:
converted_table = self._dataframe_converter.convert(table_df)
except TypeError as e:
logger.error(f"Error while converting table to string: {e}")
except TypeError:
logger.exception("Error while converting table to string")
continue
if not converted_table.strip():
continue
Expand All @@ -215,8 +215,8 @@ def _extract_tables_from_text_page(
information_id=hash_datetime(),
)
)
except Exception as e:
logger.warning(f"Failed to find tables on page {page_index}: {e}")
except Exception:
logger.exception("Failed to find tables on page %d", page_index)

return table_elements

Expand Down Expand Up @@ -321,19 +321,19 @@ def _extract_tables_from_scanned_page(
},
)
)
except Exception as e:
logger.warning(f"Failed to convert Camelot table {i + 1}: {e}")
except Exception:
logger.exception("Failed to convert Camelot table %d", i + 1)

except Exception as e:
logger.debug(f"Camelot table extraction failed for page {page_index}: {e}")
except Exception:
logger.exception("Camelot table extraction failed for page %d", page_index)

return table_elements

def _extract_text_from_text_page(self, page: Page) -> str:
try:
return page.extract_text() or ""
except Exception as e:
logger.warning(f"Failed to extract text with pdfplumber: {e}")
except Exception:
logger.exception("Failed to extract text with pdfplumber")
return ""

def _extract_content_from_page(
Expand Down
Loading