diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..4a8bcac --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +from pathlib import Path + +import pytest + + +@pytest.fixture +def test_file_path(): + return Path(__file__).parent / "files" / "rwservlet.pdf" + diff --git a/tests/test_detector.py b/tests/test_detector.py index 5ad5690..eebd5ce 100644 --- a/tests/test_detector.py +++ b/tests/test_detector.py @@ -1,17 +1,13 @@ -from pathlib import Path - from tika import detector -TEST_FILE_PATH = Path(__file__).parent / "files" / "rwservlet.pdf" - -def test_local_binary(): - with open(TEST_FILE_PATH, "rb") as file_obj: +def test_local_binary(test_file_path): + with open(test_file_path, "rb") as file_obj: assert detector.from_file(file_obj) == "application/pdf" -def test_local_path(): - assert detector.from_file(str(TEST_FILE_PATH)) == "application/pdf" +def test_local_path(test_file_path): + assert detector.from_file(str(test_file_path)) == "application/pdf" def test_local_buffer(): diff --git a/tests/test_from_file_service.py b/tests/test_from_file_service.py index 9831ff7..babe49a 100644 --- a/tests/test_from_file_service.py +++ b/tests/test_from_file_service.py @@ -14,56 +14,59 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -# python -m unittest tika.tests.test_from_file_service -import unittest from unittest import mock -import tika.parser - - -class CreateTest(unittest.TestCase): - 'test different services in from_file parsing: Content, Metadata or both in recursive mode' - - def test_default_service(self): - 'parse file using default service' - result = tika.parser.from_file( - 'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf') - self.assertEqual(result['metadata']['Content-Type'],'application/pdf') - self.assertIn('AUTORIDADES Y PERSONAL',result['content']) - @mock.patch('tika.parser._parse') - @mock.patch('tika.parser.parse1') - def test_remote_endpoint(self, tika_call_mock, _): - result = tika.parser.from_file( - 'filename', 'http://tika:9998/tika') - - tika_call_mock.assert_called_with( - 'all', 'filename', 'http://tika:9998/tika', headers=None, config_path=None, - requestOptions={}) - def test_default_service_explicit(self): - 'parse file using default service explicitly' - result = tika.parser.from_file( - 'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='all') - self.assertEqual(result['metadata']['Content-Type'],'application/pdf') - self.assertIn('AUTORIDADES Y PERSONAL',result['content']) - def test_text_service(self): - 'parse file using the content only service' - result = tika.parser.from_file( - 'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='text') - self.assertIsNone(result['metadata']) - self.assertIn('AUTORIDADES Y PERSONAL',result['content']) - def test_meta_service(self): - 'parse file using the content only service' - result = tika.parser.from_file( - 'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='meta') - self.assertIsNone(result['content']) - self.assertEqual(result['metadata']['Content-Type'],'application/pdf') - def test_invalid_service(self): - 'parse file using an invalid service should perform the default parsing' - result = tika.parser.from_file( - 
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='bad') - self.assertEqual(result['metadata']['Content-Type'],'application/pdf') - self.assertIn('AUTORIDADES Y PERSONAL',result['content']) - -if __name__ == '__main__': - unittest.main() +from tika import parser + +TEST_PDF_URL = "https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf" + + +def test_default_service(): + "parse file using default service" + result = parser.from_file(TEST_PDF_URL) + assert result["metadata"]["Content-Type"] == "application/pdf" + assert "AUTORIDADES Y PERSONAL" in result["content"] + + +@mock.patch("tika.parser._parse") +@mock.patch("tika.parser.parse1") +def test_remote_endpoint(tika_call_mock, _): + result = parser.from_file("filename", "http://tika:9998/tika") + + tika_call_mock.assert_called_with( + "all", + "filename", + "http://tika:9998/tika", + headers=None, + config_path=None, + requestOptions={}, + ) + + +def test_default_service_explicit(): + "parse file using default service explicitly" + result = parser.from_file(TEST_PDF_URL, service="all") + assert result["metadata"]["Content-Type"] == "application/pdf" + assert "AUTORIDADES Y PERSONAL" in result["content"] + + +def test_text_service(): + "parse file using the content only service" + result = parser.from_file(TEST_PDF_URL, service="text") + assert result["metadata"] is None + assert "AUTORIDADES Y PERSONAL" in result["content"] + + +def test_meta_service(): + "parse file using the metadata only service" + result = parser.from_file(TEST_PDF_URL, service="meta") + assert result["content"] is None + assert result["metadata"]["Content-Type"] == "application/pdf" + + +def test_invalid_service(): + "parse file using an invalid service should perform the default parsing" + result = parser.from_file(TEST_PDF_URL, service="bad") + assert result["metadata"]["Content-Type"] == "application/pdf" + assert "AUTORIDADES Y PERSONAL" in result["content"] diff --git a/tests/test_language.py 
b/tests/test_language.py index 81a4be8..a164bb8 100644 --- a/tests/test_language.py +++ b/tests/test_language.py @@ -1,17 +1,13 @@ -from pathlib import Path - from tika import language -TEST_FILE_PATH = Path(__file__).parent / "files" / "rwservlet.pdf" - -def test_local_binary(): - with open(TEST_FILE_PATH, "rb") as file_obj: +def test_local_binary(test_file_path): + with open(test_file_path, "rb") as file_obj: assert language.from_file(file_obj) == "en" -def test_local_path(): - assert language.from_file(str(TEST_FILE_PATH)) == "en" +def test_local_path(test_file_path): + assert language.from_file(str(test_file_path)) == "en" def test_local_buffer(): diff --git a/tests/test_parser.py b/tests/test_parser.py new file mode 100644 index 0000000..5767c93 --- /dev/null +++ b/tests/test_parser.py @@ -0,0 +1,43 @@ +from http import HTTPStatus + +from tika import parser + + +def test_remote_pdf(): + """parse remote PDF""" + assert parser.from_file( + "https://upload.wikimedia.org/wikipedia/commons/4/42/Article_feedback_flow_B_-_Thank_editors.pdf") + + +def test_remote_html(): + """parse remote HTML""" + assert parser.from_file("http://nossl.sh") + + +def test_remote_mp3(): + """parse remote mp3""" + assert parser.from_file( + "https://archive.org/download/Ainst-Spaceshipdemo.mp3/Ainst-Spaceshipdemo.mp3") + + +def test_remote_jpg(): + """parse remote jpg""" + assert parser.from_file( + "https://upload.wikimedia.org/wikipedia/commons/b/b7/X_logo.jpg") + + +def test_local_binary(test_file_path): + """parse file binary""" + with open(test_file_path, "rb") as file_obj: + assert parser.from_file(file_obj) + + +def test_local_buffer(): + response = parser.from_buffer("Good evening, Dave") + assert response["status"] == HTTPStatus.OK + + +def test_local_path(test_file_path): + """parse file path""" + assert parser.from_file(str(test_file_path)) + diff --git a/tests/test_pdf.py b/tests/test_pdf.py index 33fc30f..e32a84d 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ 
-1,10 +1,6 @@ -from pathlib import Path - from tika import pdf -TEST_FILE_PATH = Path(__file__).parent / "files" / "rwservlet.pdf" - -def test_local_path(): - text_pages = pdf.text_from_pdf_pages(str(TEST_FILE_PATH)) +def test_local_path(test_file_path): + text_pages = pdf.text_from_pdf_pages(str(test_file_path)) assert isinstance(text_pages, list) diff --git a/tests/test_tika.py b/tests/test_tika.py index 2073513..3bc4325 100644 --- a/tests/test_tika.py +++ b/tests/test_tika.py @@ -14,56 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from http import HTTPStatus -from pathlib import Path - -import tika.parser import tika.tika - -TEST_FILE_PATH = Path(__file__).parent / "files" / "rwservlet.pdf" - - -def test_remote_pdf(): - """parse remote PDF""" - assert tika.parser.from_file( - "https://upload.wikimedia.org/wikipedia/commons/4/42/Article_feedback_flow_B_-_Thank_editors.pdf") - - -def test_remote_html(): - """parse remote HTML""" - assert tika.parser.from_file("http://nossl.sh") - - -def test_remote_mp3(): - """parse remote mp3""" - assert tika.parser.from_file( - "https://archive.org/download/Ainst-Spaceshipdemo.mp3/Ainst-Spaceshipdemo.mp3") - - -def test_remote_jpg(): - """parse remote jpg""" - assert tika.parser.from_file( - "https://upload.wikimedia.org/wikipedia/commons/b/b7/X_logo.jpg") - - -def test_local_binary(): - """parse file binary""" - with open(TEST_FILE_PATH, "rb") as file_obj: - assert tika.parser.from_file(file_obj) - - -def test_local_buffer(): - response = tika.parser.from_buffer("Good evening, Dave") - assert response["status"] == HTTPStatus.OK - - -def test_local_path(): - """parse file path""" - assert tika.parser.from_file(str(TEST_FILE_PATH)) +from tika import parser -def test_kill_server(): +def test_kill_server(test_file_path): """parse some file then kills server""" - with open(TEST_FILE_PATH, "rb") as file_obj: + with open(test_file_path, "rb") as file_obj: 
tika.parser.from_file(file_obj) assert tika.tika.killServer() is None diff --git a/tests/test_unpack.py b/tests/test_unpack.py index c617a4f..5a5f07f 100644 --- a/tests/test_unpack.py +++ b/tests/test_unpack.py @@ -1,5 +1,3 @@ -from tempfile import NamedTemporaryFile - from tika import unpack # Test data @@ -7,24 +5,20 @@ TEXT_ASCII = "Hello, world!!" -def test_utf8(): +def test_utf8(tmp_path): """Test UTF-8 encoding""" - with NamedTemporaryFile("w+b", prefix="tika-python", suffix=".txt", dir="/tmp") as f: - f.write(TEXT_UTF8.encode("utf8")) - f.flush() - f.seek(0) - parsed = unpack.from_file(f.name) - assert parsed["content"].strip() == TEXT_UTF8 + test_file = tmp_path / "test_utf8.txt" + test_file.write_bytes(TEXT_UTF8.encode("utf8")) + parsed = unpack.from_file(str(test_file)) + assert parsed["content"].strip() == TEXT_UTF8 -def test_ascii(): +def test_ascii(tmp_path): """Test ASCII encoding""" - with NamedTemporaryFile("w+t", prefix="tika-python", suffix=".txt", dir="/tmp") as f: - f.write(TEXT_ASCII) - f.flush() - f.seek(0) - parsed = unpack.from_file(f.name) - assert parsed["content"].strip() == TEXT_ASCII + test_file = tmp_path / "test_ascii.txt" + test_file.write_text(TEXT_ASCII) + parsed = unpack.from_file(str(test_file)) + assert parsed["content"].strip() == TEXT_ASCII def test_from_buffer():