From ed19388ab6e9e8c9ad49b29195d859592b9d2017 Mon Sep 17 00:00:00 2001 From: johny Date: Mon, 16 Mar 2026 13:19:02 +0530 Subject: [PATCH 01/10] feat: add whisper_detail method to LLMWhispererClientV2 Add whisper_detail() to retrieve extraction job metadata via the /whisper-detail API endpoint. Includes unit and integration tests. Also updates README with running tests instructions and removes outdated legacy client references. Co-Authored-By: Claude Opus 4.6 --- README.md | 27 +++++++++++++--- src/unstract/llmwhisperer/client_v2.py | 33 +++++++++++++++++++ tests/integration/client_v2_test.py | 44 ++++++++++++++++++++++++++ tests/unit/client_v2_test.py | 40 +++++++++++++++++++++++ 4 files changed, 139 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9468ae5..7249953 100644 --- a/README.md +++ b/README.md @@ -9,16 +9,33 @@ LLMs are powerful, but their output is as good as the input you provide. LLMWhis Refer to the client documentation for more information: [LLMWhisperer Client Documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/python_client/llm_whisperer_python_client_intro/) -## A note on versions +## Client -There are two versions of the client library available in this package: +This package provides **LLMWhispererClientV2**, the client for LLMWhisperer API v2. It is required for all users on API version 2.0.0 and above. -**LLMWhispererClient**: This is the legacy version of the client library and is recommended for supporting older apps only. This version will be deprecated in the future. +Documentation is available [here](https://docs.unstract.com/llmwhisperer/). -**LLMWhispererClientV2**: This is the latest version of the client library and is recommended for all new users. It is mandatory for all users who are using LLMWhisperer API version 2.0.0 and above (All customers who have signed up after 5th November 2024). +## Running Tests -Documentation for both versions are available [here](https://docs.unstract.com/llmwhisperer/) +Install test dependencies and run all tests: +```bash +uv run --group test pytest +``` + +To run only unit tests (skipping integration tests): + +```bash +uv run --group test pytest tests/unit tests/utils_test.py +``` + +To run only integration tests: + +```bash +uv run --group test pytest tests/integration +``` + +Integration tests require a valid API key. Copy `sample.env` to `.env` and fill in your credentials before running them. ## Questions and Feedback diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py index 3aea146..41d014b 100644 --- a/src/unstract/llmwhisperer/client_v2.py +++ b/src/unstract/llmwhisperer/client_v2.py @@ -325,6 +325,39 @@ def get_highlight_data(self, whisper_hash: str, lines: str, extract_all_lines: b raise LLMWhispererClientException(err) return json.loads(response.text) + def whisper_detail(self, whisper_hash: str) -> Any: + """Retrieves the details of a text extraction process. + + This method sends a GET request to the '/whisper-detail' endpoint of the LLMWhisperer API. + The response is a JSON object containing metadata about the extraction job. + Refer to https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_detail_api + + Args: + whisper_hash (str): The identifier returned when starting the extraction process. + + Returns: + Dict[Any, Any]: A dictionary containing the extraction details including + completed_at, mode, processed_pages, processing_started_at, + processing_time_in_seconds, requested_pages, tag, total_pages, + upload_file_size_in_kb, and whisper_hash. + + Raises: + LLMWhispererClientException: If the API request fails, it raises an exception with + the error message and status code returned by the API. + """ + self.logger.debug("whisper_detail called") + url = f"{self.base_url}/whisper-detail" + params = {"whisper_hash": whisper_hash} + self.logger.debug("url: %s", url) + req = requests.Request("GET", url, headers=self.headers, params=params) + prepared = req.prepare() + response = self._send_request(prepared) + if response.status_code != 200: + err = json.loads(response.text) + err["status_code"] = response.status_code + raise LLMWhispererClientException(err) + return json.loads(response.text) + def whisper( self, file_path: str = "", diff --git a/tests/integration/client_v2_test.py b/tests/integration/client_v2_test.py index 32aa8a7..92ff527 100644 --- a/tests/integration/client_v2_test.py +++ b/tests/integration/client_v2_test.py @@ -121,6 +121,50 @@ def test_highlight(client_v2: LLMWhispererClientV2, data_dir: str, input_file: s assert line2["page_height"] == pytest.approx(3168, abs=PAGE_HEIGHT_TOLERANCE) +def test_whisper_detail(client_v2: LLMWhispererClientV2, data_dir: str) -> None: + """Test whisper_detail returns extraction metadata after a whisper operation.""" + file_path = os.path.join(data_dir, "credit_card.pdf") + whisper_result = client_v2.whisper( + mode="native_text", + output_mode="text", + file_path=file_path, + wait_for_completion=True, + ) + whisper_hash = whisper_result["whisper_hash"] + + detail = client_v2.whisper_detail(whisper_hash) + + assert isinstance(detail, dict) + assert detail["whisper_hash"] == whisper_hash + expected_keys = [ + "completed_at", + "mode", + "processed_pages", + "processing_started_at", + "processing_time_in_seconds", + "requested_pages", + "tag", + "total_pages", + "upload_file_size_in_kb", + "whisper_hash", + ] + assert set(expected_keys).issubset( + detail.keys() + ), f"whisper_detail is missing expected keys: {set(expected_keys) - set(detail.keys())}" + assert detail["mode"] == "native_text" + assert detail["processed_pages"] > 0 + assert detail["total_pages"] > 0 + + +def test_whisper_detail_not_found(client_v2: LLMWhispererClientV2) -> None: + """Test whisper_detail raises exception for a nonexistent whisper_hash.""" + with pytest.raises(LLMWhispererClientException) as exc_info: + client_v2.whisper_detail("nonexistent_hash_12345") + + error = exc_info.value.error_message() + assert error["status_code"] == 400 + + @pytest.mark.parametrize( "output_mode, mode, url, input_file, page_count", [ diff --git a/tests/unit/client_v2_test.py b/tests/unit/client_v2_test.py index a2ea822..9041fa6 100644 --- a/tests/unit/client_v2_test.py +++ b/tests/unit/client_v2_test.py @@ -39,6 +39,46 @@ def test_get_webhook_details(mocker: MockerFixture, client_v2: LLMWhispererClien assert response["webhook_details"]["url"] == WEBHOOK_URL +def test_whisper_detail_success(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None: + """Test whisper_detail returns extraction details on success.""" + mock_send = mocker.patch("requests.Session.send") + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = ( + '{"whisper_hash": "abc123", "mode": "high_quality", "processed_pages": 3,' + ' "requested_pages": 3, "total_pages": 5, "upload_file_size_in_kb": 120.5,' + ' "processing_time_in_seconds": 4.2, "completed_at": "2025-01-01T00:00:00Z",' + ' "processing_started_at": "2025-01-01T00:00:00Z", "tag": "default"}' + ) + mock_send.return_value = mock_response + + response = client_v2.whisper_detail("abc123") + + assert response["whisper_hash"] == "abc123" + assert response["mode"] == "high_quality" + assert response["processed_pages"] == 3 + assert response["total_pages"] == 5 + assert response["upload_file_size_in_kb"] == 120.5 + mock_send.assert_called_once() + + +def test_whisper_detail_not_found(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None: + """Test whisper_detail raises exception when record is not found.""" + mock_send = mocker.patch("requests.Session.send") + mock_response = MagicMock() + mock_response.status_code = 400 + mock_response.text = '{"message": "Record not found"}' + mock_send.return_value = mock_response + + with pytest.raises(LLMWhispererClientException) as exc_info: + client_v2.whisper_detail("nonexistent_hash") + + error = exc_info.value.args[0] + assert error["message"] == "Record not found" + assert error["status_code"] == 400 + mock_send.assert_called_once() + + def test_whisper_json_string_response_error(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None: """Test whisper method handles JSON string responses correctly for error cases.""" From 258b3968039321f2d82165ce03974671ae07615b Mon Sep 17 00:00:00 2001 From: johny Date: Mon, 16 Mar 2026 13:20:18 +0530 Subject: [PATCH 02/10] chore: remove legacy v1 base URL from sample.env Co-Authored-By: Claude Opus 4.6 --- sample.env | 1 - 1 file changed, 1 deletion(-) diff --git a/sample.env b/sample.env index 4b9d712..035a208 100644 --- a/sample.env +++ b/sample.env @@ -1,4 +1,3 @@ -LLMWHISPERER_BASE_URL=https://llmwhisperer-api.unstract.com/v1 LLMWHISPERER_BASE_URL_V2=https://llmwhisperer-api.us-central.unstract.com/api/v2 LLMWHISPERER_LOG_LEVEL=DEBUG LLMWHISPERER_API_KEY= From 0ac90d9cf5390b8c54e7b80e14c553c981feed0d Mon Sep 17 00:00:00 2001 From: johny Date: Mon, 16 Mar 2026 13:24:09 +0530 Subject: [PATCH 03/10] fix: guard whisper_detail error path against non-JSON responses Handle empty body and non-JSON error responses consistently with whisper_status, raising LLMWhispererClientException instead of letting json.JSONDecodeError propagate. Co-Authored-By: Claude Opus 4.6 --- src/unstract/llmwhisperer/client_v2.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py index 41d014b..de7dc44 100644 --- a/src/unstract/llmwhisperer/client_v2.py +++ b/src/unstract/llmwhisperer/client_v2.py @@ -353,7 +353,17 @@ def whisper_detail(self, whisper_hash: str) -> Any: prepared = req.prepare() response = self._send_request(prepared) if response.status_code != 200: - err = json.loads(response.text) + if not (response.text or "").strip(): + raise LLMWhispererClientException( + "API error: empty response body", response.status_code + ) + try: + err = json.loads(response.text) + except json.JSONDecodeError as e: + response_preview = response.text[:500] + "..." if len(response.text) > 500 else response.text + raise LLMWhispererClientException( + f"API error: non-JSON response - {response_preview}", response.status_code + ) from e err["status_code"] = response.status_code raise LLMWhispererClientException(err) return json.loads(response.text) From 55e72d9c155c979275fe152791592f47632b6673 Mon Sep 17 00:00:00 2001 From: Rahul Johny <116638720+johnyrahul@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:30:30 +0530 Subject: [PATCH 04/10] Update src/unstract/llmwhisperer/client_v2.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: Rahul Johny <116638720+johnyrahul@users.noreply.github.com> --- src/unstract/llmwhisperer/client_v2.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py index de7dc44..de560db 100644 --- a/src/unstract/llmwhisperer/client_v2.py +++ b/src/unstract/llmwhisperer/client_v2.py @@ -353,19 +353,8 @@ def whisper_detail(self, whisper_hash: str) -> Any: prepared = req.prepare() response = self._send_request(prepared) if response.status_code != 200: - if not (response.text or "").strip(): - raise LLMWhispererClientException( - "API error: empty response body", response.status_code - ) - try: - err = json.loads(response.text) - except json.JSONDecodeError as e: - response_preview = response.text[:500] + "..." if len(response.text) > 500 else response.text - raise LLMWhispererClientException( - f"API error: non-JSON response - {response_preview}", response.status_code - ) from e err["status_code"] = response.status_code - raise LLMWhispererClientException(err) + raise LLMWhispererClientException(err, response.status_code) return json.loads(response.text) def whisper( From ff4955139f8036ecd65a4f29032b779962aa10cf Mon Sep 17 00:00:00 2001 From: Rahul Johny <116638720+johnyrahul@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:35:10 +0530 Subject: [PATCH 05/10] Update tests/unit/client_v2_test.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: Rahul Johny <116638720+johnyrahul@users.noreply.github.com> --- tests/unit/client_v2_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/client_v2_test.py b/tests/unit/client_v2_test.py index 9041fa6..ce7a71b 100644 --- a/tests/unit/client_v2_test.py +++ b/tests/unit/client_v2_test.py @@ -73,7 +73,7 @@ def test_whisper_detail_not_found(mocker: MockerFixture, client_v2: LLMWhisperer with pytest.raises(LLMWhispererClientException) as exc_info: client_v2.whisper_detail("nonexistent_hash") - error = exc_info.value.args[0] + error = exc_info.value.error_message() assert error["message"] == "Record not found" assert error["status_code"] == 400 mock_send.assert_called_once() From 14b3961fce3be4dfc8f514e9f06452a6cca42817 Mon Sep 17 00:00:00 2001 From: Rahul Johny <116638720+johnyrahul@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:35:20 +0530 Subject: [PATCH 06/10] Update src/unstract/llmwhisperer/client_v2.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: Rahul Johny <116638720+johnyrahul@users.noreply.github.com> --- src/unstract/llmwhisperer/client_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py index de560db..54c3a9b 100644 --- a/src/unstract/llmwhisperer/client_v2.py +++ b/src/unstract/llmwhisperer/client_v2.py @@ -352,7 +352,7 @@ def whisper_detail(self, whisper_hash: str) -> Any: req = requests.Request("GET", url, headers=self.headers, params=params) prepared = req.prepare() response = self._send_request(prepared) - if response.status_code != 200: + err = json.loads(response.text) err["status_code"] = response.status_code raise LLMWhispererClientException(err, response.status_code) return json.loads(response.text) From 34bae5808e631481bc9ee592d6ed6c4595dc298f Mon Sep 17 00:00:00 2001 From: johny Date: Mon, 16 Mar 2026 13:49:08 +0530 Subject: [PATCH 07/10] fix: guard whisper_detail error path against non-JSON and non-dict responses Align error handling with whisper_status pattern: handle empty body, non-JSON responses, and pass status_code as separate exception argument. Co-Authored-By: Claude Opus 4.6 --- src/unstract/llmwhisperer/client_v2.py | 12 ++++++++++-- tests/unit/client_v2_test.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py index 54c3a9b..add9bfa 100644 --- a/src/unstract/llmwhisperer/client_v2.py +++ b/src/unstract/llmwhisperer/client_v2.py @@ -352,8 +352,16 @@ def whisper_detail(self, whisper_hash: str) -> Any: req = requests.Request("GET", url, headers=self.headers, params=params) prepared = req.prepare() response = self._send_request(prepared) - err = json.loads(response.text) - err["status_code"] = response.status_code + if response.status_code != 200: + if not (response.text or "").strip(): + raise LLMWhispererClientException("API error: empty response body", response.status_code) + try: + err = json.loads(response.text) + except json.JSONDecodeError as e: + response_preview = response.text[:500] + "..." if len(response.text) > 500 else response.text + raise LLMWhispererClientException( + f"API error: non-JSON response - {response_preview}", response.status_code + ) from e raise LLMWhispererClientException(err, response.status_code) return json.loads(response.text) diff --git a/tests/unit/client_v2_test.py b/tests/unit/client_v2_test.py index ce7a71b..8b1535f 100644 --- a/tests/unit/client_v2_test.py +++ b/tests/unit/client_v2_test.py @@ -75,7 +75,7 @@ def test_whisper_detail_not_found(mocker: MockerFixture, client_v2: LLMWhisperer error = exc_info.value.error_message() assert error["message"] == "Record not found" - assert error["status_code"] == 400 + assert exc_info.value.status_code == 400 mock_send.assert_called_once() From f549c4e82a1761a5d0545b0a61a27b0e8ba437f7 Mon Sep 17 00:00:00 2001 From: johny Date: Mon, 16 Mar 2026 14:47:41 +0530 Subject: [PATCH 08/10] refactor: make integration test assertion for whisper_detail_not_found less brittle Check for status code and message key presence instead of exact error string, which may change on the API side. Co-Authored-By: Claude Opus 4.6 --- tests/integration/client_v2_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/client_v2_test.py b/tests/integration/client_v2_test.py index 92ff527..6eadcf3 100644 --- a/tests/integration/client_v2_test.py +++ b/tests/integration/client_v2_test.py @@ -162,7 +162,8 @@ def test_whisper_detail_not_found(client_v2: LLMWhispererClientV2) -> None: client_v2.whisper_detail("nonexistent_hash_12345") error = exc_info.value.error_message() - assert error["status_code"] == 400 + assert exc_info.value.status_code == 400 + assert "message" in error @pytest.mark.parametrize( From 1a88c9e8c46f47d4745709330543116448110fd0 Mon Sep 17 00:00:00 2001 From: johny Date: Mon, 16 Mar 2026 14:59:35 +0530 Subject: [PATCH 09/10] chore: add debug logging for whisper_hash in whisper_detail Co-Authored-By: Claude Opus 4.6 --- src/unstract/llmwhisperer/client_v2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py index add9bfa..00afc50 100644 --- a/src/unstract/llmwhisperer/client_v2.py +++ b/src/unstract/llmwhisperer/client_v2.py @@ -349,6 +349,8 @@ def whisper_detail(self, whisper_hash: str) -> Any: url = f"{self.base_url}/whisper-detail" params = {"whisper_hash": whisper_hash} self.logger.debug("url: %s", url) + self.logger.debug("whisper_hash: %s", whisper_hash) + req = requests.Request("GET", url, headers=self.headers, params=params) prepared = req.prepare() response = self._send_request(prepared) From 6ccfd6d03b0ee983a602d60b8ff1a7b8733d1443 Mon Sep 17 00:00:00 2001 From: johny Date: Mon, 16 Mar 2026 15:22:27 +0530 Subject: [PATCH 10/10] fix: reorder integration tests to avoid usage count drift Move whisper_detail tests after usage-sensitive url_in_post tests to prevent extra extractions from inflating usage counters. Co-Authored-By: Claude Opus 4.6 --- tests/integration/client_v2_test.py | 90 ++++++++++++++--------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/tests/integration/client_v2_test.py b/tests/integration/client_v2_test.py index 6eadcf3..dd60297 100644 --- a/tests/integration/client_v2_test.py +++ b/tests/integration/client_v2_test.py @@ -121,51 +121,6 @@ def test_highlight(client_v2: LLMWhispererClientV2, data_dir: str, input_file: s assert line2["page_height"] == pytest.approx(3168, abs=PAGE_HEIGHT_TOLERANCE) -def test_whisper_detail(client_v2: LLMWhispererClientV2, data_dir: str) -> None: - """Test whisper_detail returns extraction metadata after a whisper operation.""" - file_path = os.path.join(data_dir, "credit_card.pdf") - whisper_result = client_v2.whisper( - mode="native_text", - output_mode="text", - file_path=file_path, - wait_for_completion=True, - ) - whisper_hash = whisper_result["whisper_hash"] - - detail = client_v2.whisper_detail(whisper_hash) - - assert isinstance(detail, dict) - assert detail["whisper_hash"] == whisper_hash - expected_keys = [ - "completed_at", - "mode", - "processed_pages", - "processing_started_at", - "processing_time_in_seconds", - "requested_pages", - "tag", - "total_pages", - "upload_file_size_in_kb", - "whisper_hash", - ] - assert set(expected_keys).issubset( - detail.keys() - ), f"whisper_detail is missing expected keys: {set(expected_keys) - set(detail.keys())}" - assert detail["mode"] == "native_text" - assert detail["processed_pages"] > 0 - assert detail["total_pages"] > 0 - - -def test_whisper_detail_not_found(client_v2: LLMWhispererClientV2) -> None: - """Test whisper_detail raises exception for a nonexistent whisper_hash.""" - with pytest.raises(LLMWhispererClientException) as exc_info: - client_v2.whisper_detail("nonexistent_hash_12345") - - error = exc_info.value.error_message() - assert exc_info.value.status_code == 400 - assert "message" in error - - @pytest.mark.parametrize( "output_mode, mode, url, input_file, page_count", [ @@ -280,6 +235,51 @@ def test_webhook(client_v2: LLMWhispererClientV2, url: str, token: str, webhook_ assert e.error_message()["status_code"] == 404 +def test_whisper_detail(client_v2: LLMWhispererClientV2, data_dir: str) -> None: + """Test whisper_detail returns extraction metadata after a whisper operation.""" + file_path = os.path.join(data_dir, "credit_card.pdf") + whisper_result = client_v2.whisper( + mode="native_text", + output_mode="text", + file_path=file_path, + wait_for_completion=True, + ) + whisper_hash = whisper_result["whisper_hash"] + + detail = client_v2.whisper_detail(whisper_hash) + + assert isinstance(detail, dict) + assert detail["whisper_hash"] == whisper_hash + expected_keys = [ + "completed_at", + "mode", + "processed_pages", + "processing_started_at", + "processing_time_in_seconds", + "requested_pages", + "tag", + "total_pages", + "upload_file_size_in_kb", + "whisper_hash", + ] + assert set(expected_keys).issubset( + detail.keys() + ), f"whisper_detail is missing expected keys: {set(expected_keys) - set(detail.keys())}" + assert detail["mode"] == "native_text" + assert detail["processed_pages"] > 0 + assert detail["total_pages"] > 0 + + +def test_whisper_detail_not_found(client_v2: LLMWhispererClientV2) -> None: + """Test whisper_detail raises exception for a nonexistent whisper_hash.""" + with pytest.raises(LLMWhispererClientException) as exc_info: + client_v2.whisper_detail("nonexistent_hash_12345") + + error = exc_info.value.error_message() + assert exc_info.value.status_code == 400 + assert "message" in error + + def assert_error_message(whisper_result: dict) -> None: assert isinstance(whisper_result, dict) assert whisper_result["status"] == "error"