diff --git a/README.md b/README.md index 9468ae5..7249953 100644 --- a/README.md +++ b/README.md @@ -9,16 +9,33 @@ LLMs are powerful, but their output is as good as the input you provide. LLMWhis Refer to the client documentation for more information: [LLMWhisperer Client Documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/python_client/llm_whisperer_python_client_intro/) -## A note on versions +## Client -There are two versions of the client library available in this package: +This package provides **LLMWhispererClientV2**, the client for LLMWhisperer API v2. It is required for all users on API version 2.0.0 and above. -**LLMWhispererClient**: This is the legacy version of the client library and is recommended for supporting older apps only. This version will be deprecated in the future. +Documentation is available [here](https://docs.unstract.com/llmwhisperer/). -**LLMWhispererClientV2**: This is the latest version of the client library and is recommended for all new users. It is mandatory for all users who are using LLMWhisperer API version 2.0.0 and above (All customers who have signed up after 5th November 2024). +## Running Tests -Documentation for both versions are available [here](https://docs.unstract.com/llmwhisperer/) +Install test dependencies and run all tests: +```bash +uv run --group test pytest +``` + +To run only unit tests (skipping integration tests): + +```bash +uv run --group test pytest tests/unit tests/utils_test.py +``` + +To run only integration tests: + +```bash +uv run --group test pytest tests/integration +``` + +Integration tests require a valid API key. Copy `sample.env` to `.env` and fill in your credentials before running them. ## Questions and Feedback diff --git a/sample.env b/sample.env index 4b9d712..035a208 100644 --- a/sample.env +++ b/sample.env @@ -1,4 +1,3 @@ -LLMWHISPERER_BASE_URL=https://llmwhisperer-api.unstract.com/v1 LLMWHISPERER_BASE_URL_V2=https://llmwhisperer-api.us-central.unstract.com/api/v2 LLMWHISPERER_LOG_LEVEL=DEBUG LLMWHISPERER_API_KEY= diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py index 3aea146..00afc50 100644 --- a/src/unstract/llmwhisperer/client_v2.py +++ b/src/unstract/llmwhisperer/client_v2.py @@ -325,6 +325,48 @@ def get_highlight_data(self, whisper_hash: str, lines: str, extract_all_lines: b raise LLMWhispererClientException(err) return json.loads(response.text) + def whisper_detail(self, whisper_hash: str) -> Any: + """Retrieves the details of a text extraction process. + + This method sends a GET request to the '/whisper-detail' endpoint of the LLMWhisperer API. + The response is a JSON object containing metadata about the extraction job. + Refer to https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_detail_api + + Args: + whisper_hash (str): The identifier returned when starting the extraction process. + + Returns: + Dict[Any, Any]: A dictionary containing the extraction details including + completed_at, mode, processed_pages, processing_started_at, + processing_time_in_seconds, requested_pages, tag, total_pages, + upload_file_size_in_kb, and whisper_hash. + + Raises: + LLMWhispererClientException: If the API request fails, it raises an exception with + the error message and status code returned by the API. + """ + self.logger.debug("whisper_detail called") + url = f"{self.base_url}/whisper-detail" + params = {"whisper_hash": whisper_hash} + self.logger.debug("url: %s", url) + self.logger.debug("whisper_hash: %s", whisper_hash) + + req = requests.Request("GET", url, headers=self.headers, params=params) + prepared = req.prepare() + response = self._send_request(prepared) + if response.status_code != 200: + if not (response.text or "").strip(): + raise LLMWhispererClientException("API error: empty response body", response.status_code) + try: + err = json.loads(response.text) + except json.JSONDecodeError as e: + response_preview = response.text[:500] + "..." if len(response.text) > 500 else response.text + raise LLMWhispererClientException( + f"API error: non-JSON response - {response_preview}", response.status_code + ) from e + raise LLMWhispererClientException(err, response.status_code) + return json.loads(response.text) + def whisper( self, file_path: str = "", diff --git a/tests/integration/client_v2_test.py b/tests/integration/client_v2_test.py index 32aa8a7..dd60297 100644 --- a/tests/integration/client_v2_test.py +++ b/tests/integration/client_v2_test.py @@ -235,6 +235,51 @@ def test_webhook(client_v2: LLMWhispererClientV2, url: str, token: str, webhook_ assert e.error_message()["status_code"] == 404 +def test_whisper_detail(client_v2: LLMWhispererClientV2, data_dir: str) -> None: + """Test whisper_detail returns extraction metadata after a whisper operation.""" + file_path = os.path.join(data_dir, "credit_card.pdf") + whisper_result = client_v2.whisper( + mode="native_text", + output_mode="text", + file_path=file_path, + wait_for_completion=True, + ) + whisper_hash = whisper_result["whisper_hash"] + + detail = client_v2.whisper_detail(whisper_hash) + + assert isinstance(detail, dict) + assert detail["whisper_hash"] == whisper_hash + expected_keys = [ + "completed_at", + "mode", + "processed_pages", + "processing_started_at", + "processing_time_in_seconds", + "requested_pages", + "tag", + "total_pages", + "upload_file_size_in_kb", + "whisper_hash", + ] + assert set(expected_keys).issubset( + detail.keys() + ), f"whisper_detail is missing expected keys: {set(expected_keys) - set(detail.keys())}" + assert detail["mode"] == "native_text" + assert detail["processed_pages"] > 0 + assert detail["total_pages"] > 0 + + +def test_whisper_detail_not_found(client_v2: LLMWhispererClientV2) -> None: + """Test whisper_detail raises exception for a nonexistent whisper_hash.""" + with pytest.raises(LLMWhispererClientException) as exc_info: + client_v2.whisper_detail("nonexistent_hash_12345") + + error = exc_info.value.error_message() + assert exc_info.value.status_code == 400 + assert "message" in error + + def assert_error_message(whisper_result: dict) -> None: assert isinstance(whisper_result, dict) assert whisper_result["status"] == "error" diff --git a/tests/unit/client_v2_test.py b/tests/unit/client_v2_test.py index a2ea822..8b1535f 100644 --- a/tests/unit/client_v2_test.py +++ b/tests/unit/client_v2_test.py @@ -39,6 +39,46 @@ def test_get_webhook_details(mocker: MockerFixture, client_v2: LLMWhispererClien assert response["webhook_details"]["url"] == WEBHOOK_URL +def test_whisper_detail_success(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None: + """Test whisper_detail returns extraction details on success.""" + mock_send = mocker.patch("requests.Session.send") + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = ( + '{"whisper_hash": "abc123", "mode": "high_quality", "processed_pages": 3,' + ' "requested_pages": 3, "total_pages": 5, "upload_file_size_in_kb": 120.5,' + ' "processing_time_in_seconds": 4.2, "completed_at": "2025-01-01T00:00:00Z",' + ' "processing_started_at": "2025-01-01T00:00:00Z", "tag": "default"}' + ) + mock_send.return_value = mock_response + + response = client_v2.whisper_detail("abc123") + + assert response["whisper_hash"] == "abc123" + assert response["mode"] == "high_quality" + assert response["processed_pages"] == 3 + assert response["total_pages"] == 5 + assert response["upload_file_size_in_kb"] == 120.5 + mock_send.assert_called_once() + + +def test_whisper_detail_not_found(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None: + """Test whisper_detail raises exception when record is not found.""" + mock_send = mocker.patch("requests.Session.send") + mock_response = MagicMock() + mock_response.status_code = 400 + mock_response.text = '{"message": "Record not found"}' + mock_send.return_value = mock_response + + with pytest.raises(LLMWhispererClientException) as exc_info: + client_v2.whisper_detail("nonexistent_hash") + + error = exc_info.value.error_message() + assert error["message"] == "Record not found" + assert exc_info.value.status_code == 400 + mock_send.assert_called_once() + + def test_whisper_json_string_response_error(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None: """Test whisper method handles JSON string responses correctly for error cases."""