From 7625c0b22c52cbe63e70e2149ef69ef1d86994e4 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 21 Apr 2026 14:14:17 -0400 Subject: [PATCH] refactor: make codebase compatible with Python 3.11 --- README.md | 4 +- docs/local_development.md | 2 +- pyproject.toml | 8 +- .../flavor/bigquery_flavor_service.py | 3 +- testgen/common/models/scores.py | 2 +- testgen/ui/queries/profiling_queries.py | 144 ++++++++++-------- testgen/ui/scripts/patch_streamlit.py | 2 +- testgen/ui/views/data_catalog.py | 5 +- .../dialogs/table_create_script_dialog.py | 3 +- testgen/ui/views/monitors_dashboard.py | 3 +- testgen/ui/views/profiling_results.py | 31 ++-- testgen/ui/views/score_details.py | 2 +- testgen/ui/views/score_explorer.py | 14 +- testgen/ui/views/test_results.py | 6 +- 14 files changed, 135 insertions(+), 94 deletions(-) diff --git a/README.md b/README.md index 4b6a5726..5383516d 100644 --- a/README.md +++ b/README.md @@ -84,14 +84,14 @@ As an alternative to the Docker Compose [installation with dk-installer (recomme | Software | Tested Versions | Command to check version | |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------|------------------------------| -| [Python](https://www.python.org/downloads/)
- Most Linux and macOS systems have Python pre-installed.
- On Windows machines, you will need to download and install it. | 3.12 | `python3 --version` | +| [Python](https://www.python.org/downloads/)
- Most Linux and macOS systems have Python pre-installed.
- On Windows machines, you will need to download and install it. | 3.11, 3.12, 3.13 | `python3 --version` | | [PostgreSQL](https://www.postgresql.org/download/) | 14.1, 15.8, 16.4 | `psql --version`| ### Install the TestGen package We recommend using a Python virtual environment to avoid any dependency conflicts with other applications installed on your machine. The [venv](https://docs.python.org/3/library/venv.html#creating-virtual-environments) module, which is part of the Python standard library, or other third-party tools, like [virtualenv](https://virtualenv.pypa.io/en/latest/) or [conda](https://docs.conda.io/en/latest/), can be used. -Create and activate a virtual environment with a TestGen-compatible version of Python (`>=3.12`). The steps may vary based on your operating system and Python installation - the [Python packaging user guide](https://packaging.python.org/en/latest/tutorials/installing-packages/) is a useful reference. +Create and activate a virtual environment with a TestGen-compatible version of Python (`>=3.11`). The steps may vary based on your operating system and Python installation - the [Python packaging user guide](https://packaging.python.org/en/latest/tutorials/installing-packages/) is a useful reference. _On Linux/Mac_ ```shell diff --git a/docs/local_development.md b/docs/local_development.md index cff533ec..95e2948f 100644 --- a/docs/local_development.md +++ b/docs/local_development.md @@ -23,7 +23,7 @@ git clone https://github.com/YOUR-USERNAME/dataops-testgen We recommend using a Python virtual environment to avoid any dependency conflicts with other applications installed on your machine. The [venv](https://docs.python.org/3/library/venv.html#creating-virtual-environments) module, which is part of the Python standard library, or other third-party tools, like [virtualenv](https://virtualenv.pypa.io/en/latest/) or [conda](https://docs.conda.io/en/latest/), can be used. 
-From the root of your local repository, create and activate a virtual environment with a TestGen-compatible version of Python (`>=3.12`). The steps may vary based on your operating system and Python installation - the [Python packaging user guide](https://packaging.python.org/en/latest/tutorials/installing-packages/) is a useful reference. +From the root of your local repository, create and activate a virtual environment with a TestGen-compatible version of Python (`>=3.11`; we develop on 3.13). The steps may vary based on your operating system and Python installation - the [Python packaging user guide](https://packaging.python.org/en/latest/tutorials/installing-packages/) is a useful reference. _On Linux/Mac_ ```shell diff --git a/pyproject.toml b/pyproject.toml index 43851025..63406242 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,11 +21,13 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", "Development Status :: 5 - Production/Stable", "Operating System :: OS Independent", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Topic :: System :: Monitoring", ] keywords = [ "dataops", "data", "quality", "testing", "database", "profiling" ] -requires-python = ">=3.12" +requires-python = ">=3.11" dependencies = [ "PyYAML==6.0.3", @@ -170,7 +172,7 @@ filterwarnings = [ # for an explanation of their functionality. 
# WARNING: When changing mypy configurations, be sure to test them after removing your .mypy_cache [tool.mypy] -python_version = "3.13" +python_version = "3.11" check_untyped_defs = true disallow_untyped_decorators = true disallow_untyped_defs = true @@ -211,7 +213,7 @@ exclude = [ ] [tool.ruff] -target-version = "py310" +target-version = "py311" line-length = 120 indent-width = 4 include = [ diff --git a/testgen/common/database/flavor/bigquery_flavor_service.py b/testgen/common/database/flavor/bigquery_flavor_service.py index 47150a73..5facf1e2 100644 --- a/testgen/common/database/flavor/bigquery_flavor_service.py +++ b/testgen/common/database/flavor/bigquery_flavor_service.py @@ -14,7 +14,8 @@ def get_connection_string_head(self, params: ResolvedConnectionParams) -> str: return f"{self.url_scheme}://" def get_connection_string_from_fields(self, params: ResolvedConnectionParams) -> str: - return f"{self.url_scheme}://{params.service_account_key["project_id"] if params.service_account_key else ""}" + project_id = params.service_account_key["project_id"] if params.service_account_key else "" + return f"{self.url_scheme}://{project_id}" def get_connect_args(self, params: ResolvedConnectionParams) -> dict: # noqa: ARG002 return {} diff --git a/testgen/common/models/scores.py b/testgen/common/models/scores.py index 6eee93c3..617f3fdb 100644 --- a/testgen/common/models/scores.py +++ b/testgen/common/models/scores.py @@ -522,7 +522,7 @@ def get_as_sql( for _, field_filters in grouped_filters: field_filters_sql = [f.get_as_sql(prefix=prefix, operand="AND") for f in field_filters] filters_sql.append( - f"({" OR ".join(field_filters_sql)})" if len(field_filters_sql) > 1 else field_filters_sql[0] + f"({' OR '.join(field_filters_sql)})" if len(field_filters_sql) > 1 else field_filters_sql[0] ) else: filters_sql = [ f.get_as_sql(prefix=prefix, operand="AND") for f in self.filters ] diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py 
index a0cb7873..65de8ccc 100644 --- a/testgen/ui/queries/profiling_queries.py +++ b/testgen/ui/queries/profiling_queries.py @@ -180,8 +180,7 @@ def get_tables_by_condition( include_active_tests: bool = False, include_scores: bool = False, ) -> list[dict]: - query = f""" - {""" + active_tests_cte = """ WITH active_test_definitions AS ( SELECT test_defs.table_groups_id, @@ -201,30 +200,18 @@ def get_tables_by_condition( test_defs.schema_name, test_defs.table_name ) - """ if include_active_tests else ""} - SELECT - table_chars.table_id::VARCHAR AS id, - 'table' AS type, - table_chars.table_name, - table_chars.schema_name, - table_chars.table_groups_id::VARCHAR AS table_group_id, - -- Characteristics - functional_table_type, - approx_record_ct, - table_chars.record_ct, - table_chars.column_ct, - add_date, - last_refresh_date, - drop_date, - {f""" + """ if include_active_tests else "" + + table_tags_select = f""" -- Table Tags table_chars.description, table_chars.critical_data_element, {", ".join([ f"table_chars.{tag}" for tag in TAG_FIELDS ])}, -- Table Groups Tags {", ".join([ f"table_groups.{tag} AS table_group_{tag}" for tag in TAG_FIELDS if tag != "aggregation_level" ])}, - """ if include_tags else ""} - {""" + """ if include_tags else "" + + has_test_runs_select = """ -- Has Test Runs EXISTS( SELECT 1 @@ -232,16 +219,47 @@ def get_tables_by_condition( WHERE table_groups_id = table_chars.table_groups_id AND table_name = table_chars.table_name ) AS has_test_runs, - """ if include_has_test_runs else ""} - {""" + """ if include_has_test_runs else "" + + active_tests_select = """ -- Test Definition Count active_tests.count AS active_test_count, - """ if include_active_tests else ""} - {""" + """ if include_active_tests else "" + + scores_select = """ -- Scores table_chars.dq_score_profiling, table_chars.dq_score_testing, - """ if include_scores else ""} + """ if include_scores else "" + + active_tests_join = """ + LEFT JOIN active_test_definitions active_tests ON ( + 
table_chars.table_groups_id = active_tests.table_groups_id + AND table_chars.schema_name = active_tests.schema_name + AND table_chars.table_name = active_tests.table_name + ) + """ if include_active_tests else "" + + query = f""" + {active_tests_cte} + SELECT + table_chars.table_id::VARCHAR AS id, + 'table' AS type, + table_chars.table_name, + table_chars.schema_name, + table_chars.table_groups_id::VARCHAR AS table_group_id, + -- Characteristics + functional_table_type, + approx_record_ct, + table_chars.record_ct, + table_chars.column_ct, + add_date, + last_refresh_date, + drop_date, + {table_tags_select} + {has_test_runs_select} + {active_tests_select} + {scores_select} -- Profile Run table_chars.last_complete_profile_run_id::VARCHAR AS profile_run_id, profiling_starttime AS profile_run_date, @@ -255,13 +273,7 @@ def get_tables_by_condition( LEFT JOIN table_groups ON ( table_chars.table_groups_id = table_groups.id ) - {""" - LEFT JOIN active_test_definitions active_tests ON ( - table_chars.table_groups_id = active_tests.table_groups_id - AND table_chars.schema_name = active_tests.schema_name - AND table_chars.table_name = active_tests.table_name - ) - """ if include_active_tests else ""} + {active_tests_join} {filter_condition} ORDER BY LOWER(table_chars.table_name); """ @@ -347,24 +359,7 @@ def get_columns_by_condition( include_active_tests: bool = False, include_scores: bool = False, ) -> list[dict]: - query = f""" - SELECT - column_chars.column_id::VARCHAR AS id, - 'column' AS type, - column_chars.column_name, - column_chars.table_name, - column_chars.schema_name, - column_chars.table_groups_id::VARCHAR AS table_group_id, - column_chars.ordinal_position, - -- Characteristics - column_chars.general_type, - column_chars.db_data_type, - column_chars.functional_data_type, - datatype_suggestion, - column_chars.add_date, - column_chars.last_mod_date, - column_chars.drop_date, - {f""" + column_tags_select = f""" -- Column Tags column_chars.description, 
column_chars.critical_data_element, @@ -376,13 +371,9 @@ def get_columns_by_condition( {", ".join([ f"table_chars.{tag} AS table_{tag}" for tag in TAG_FIELDS ])}, -- Table Groups Tags {", ".join([ f"table_groups.{tag} AS table_group_{tag}" for tag in TAG_FIELDS if tag != "aggregation_level" ])}, - """ if include_tags else ""} - -- Profile Run - column_chars.last_complete_profile_run_id::VARCHAR AS profile_run_id, - run_date AS profile_run_date, - TRUE AS is_latest_profile, - query_error AS profiling_error, - {""" + """ if include_tags else "" + + has_test_runs_select = """ -- Has Test Runs EXISTS( SELECT 1 @@ -391,8 +382,9 @@ def get_columns_by_condition( AND table_name = column_chars.table_name AND column_names = column_chars.column_name ) AS has_test_runs, - """ if include_has_test_runs else ""} - {""" + """ if include_has_test_runs else "" + + active_tests_select = """ -- Test Definition Count ( SELECT COUNT(*) @@ -402,12 +394,40 @@ def get_columns_by_condition( AND column_name = column_chars.column_name AND test_active = 'Y' ) AS active_test_count, - """ if include_active_tests else ""} - {""" + """ if include_active_tests else "" + + scores_select = """ -- Scores column_chars.dq_score_profiling, column_chars.dq_score_testing, - """ if include_scores else ""} + """ if include_scores else "" + + query = f""" + SELECT + column_chars.column_id::VARCHAR AS id, + 'column' AS type, + column_chars.column_name, + column_chars.table_name, + column_chars.schema_name, + column_chars.table_groups_id::VARCHAR AS table_group_id, + column_chars.ordinal_position, + -- Characteristics + column_chars.general_type, + column_chars.db_data_type, + column_chars.functional_data_type, + datatype_suggestion, + column_chars.add_date, + column_chars.last_mod_date, + column_chars.drop_date, + {column_tags_select} + -- Profile Run + column_chars.last_complete_profile_run_id::VARCHAR AS profile_run_id, + run_date AS profile_run_date, + TRUE AS is_latest_profile, + query_error AS 
profiling_error, + {has_test_runs_select} + {active_tests_select} + {scores_select} table_chars.approx_record_ct, table_groups.project_code, table_groups.connection_id::VARCHAR AS connection_id, diff --git a/testgen/ui/scripts/patch_streamlit.py b/testgen/ui/scripts/patch_streamlit.py index b9683003..37925626 100644 --- a/testgen/ui/scripts/patch_streamlit.py +++ b/testgen/ui/scripts/patch_streamlit.py @@ -79,7 +79,7 @@ def _create_tag(relative_filepath: str, html: BeautifulSoup) -> Tag | None: ), } - extension = f".{relative_filepath.split(".")[-1]}" + extension = f".{relative_filepath.split('.')[-1]}" if extension in tag_for_ext: return tag_for_ext[extension]() return None diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index d89a4680..8fe773c3 100644 --- a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -251,12 +251,13 @@ def get_excel_report_data( data["excluded_data_element"] = data["excluded_data_element"].apply(lambda val: "Yes" if val else None) data["pii_flag"] = data["pii_flag"].apply(lambda val: "Yes" if val else None) data["top_freq_values"] = data["top_freq_values"].apply( - lambda val: "\n".join([f"{part.split(" | ")[1]} | {part.split(" | ")[0]}" for part in val[2:].split("\n| ")]) + lambda val: "\n".join([f"{part.split(' | ')[1]} | {part.split(' | ')[0]}" for part in val[2:].split("\n| ")]) if not pd.isna(val) and val != PII_REDACTED else val ) + nl = "\n" # For Python 3.11 compatibility data["top_patterns"] = data["top_patterns"].apply( - lambda val: "".join([f"{part}{'\n' if index % 2 else ' | '}" for index, part in enumerate(val.split(" | "))]) + lambda val: "".join([f"{part}{nl if index % 2 else ' | '}" for index, part in enumerate(val.split(" | "))]) if not pd.isna(val) and val != PII_REDACTED else val ) diff --git a/testgen/ui/views/dialogs/table_create_script_dialog.py b/testgen/ui/views/dialogs/table_create_script_dialog.py index 1bcd386e..468a9754 100644 --- 
a/testgen/ui/views/dialogs/table_create_script_dialog.py +++ b/testgen/ui/views/dialogs/table_create_script_dialog.py @@ -30,7 +30,8 @@ def generate_create_script(table_name: str, data: list[dict]) -> str | None: separator = " " if index == len(table_data) - 1 else "," col_defs.append(f"{col['column_name']:<{max_name}} {(col_type):<{max_type}}{separator} {comment}") + col_defs_joined = "\n ".join(col_defs) return f""" CREATE TABLE {table_data[0]['schema_name']}.{table_data[0]['table_name']} ( - {"\n ".join(col_defs)} + {col_defs_joined} );""" diff --git a/testgen/ui/views/monitors_dashboard.py b/testgen/ui/views/monitors_dashboard.py index e19145a1..bcaedde6 100644 --- a/testgen/ui/views/monitors_dashboard.py +++ b/testgen/ui/views/monitors_dashboard.py @@ -483,9 +483,10 @@ def _monitor_changes_by_tables_query( {"OFFSET :offset" if offset else ""} """ + escaped_table_name_filter = table_name_filter.replace("_", "\\_") if table_name_filter else None params = { "table_group_id": table_group_id, - "table_name_filter": f"%{table_name_filter.replace('_', '\\_')}%" if table_name_filter else None, + "table_name_filter": f"%{escaped_table_name_filter}%" if escaped_table_name_filter else None, "sort_field": sort_field, "limit": limit, "offset": offset, diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index a1529f95..62368aac 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -225,16 +225,27 @@ def get_excel_report_data( type_map = {"A": "Alpha", "B": "Boolean", "D": "Datetime", "N": "Numeric"} data["general_type"] = data["general_type"].apply(lambda val: type_map.get(val)) - data["top_freq_values"] = data["top_freq_values"].apply( - lambda val: "\n".join([ f"{part.split(" | ")[1]} | {part.split(" | ")[0]}" for part in val[2:].split("\n| ") ]) - if val and val != PII_REDACTED - else val - ) - data["top_patterns"] = data["top_patterns"].apply( - lambda val: "".join([ f"{part}{'\n' if 
index % 2 else ' | '}" for index, part in enumerate(val.split(" | ")) ]) - if val and val != PII_REDACTED - else val - ) + def _format_top_freq_values(val): + if not val or val == PII_REDACTED: + return val + lines = [] + for part in val[2:].split("\n| "): + pieces = part.split(" | ") + lines.append(f"{pieces[1]} | {pieces[0]}") + return "\n".join(lines) + + def _format_top_patterns(val): + if not val or val == PII_REDACTED: + return val + parts = val.split(" | ") + formatted = [] + for index, part in enumerate(parts): + separator = "\n" if index % 2 else " | " + formatted.append(f"{part}{separator}") + return "".join(formatted) + + data["top_freq_values"] = data["top_freq_values"].apply(_format_top_freq_values) + data["top_patterns"] = data["top_patterns"].apply(_format_top_patterns) columns = { "table_name": {"header": "Table"}, diff --git a/testgen/ui/views/score_details.py b/testgen/ui/views/score_details.py index edc8c33c..dd98f588 100644 --- a/testgen/ui/views/score_details.py +++ b/testgen/ui/views/score_details.py @@ -196,7 +196,7 @@ def get_report_file_data(update_progress, issue) -> FILE_DATA_TYPE: update_progress(1.0) buffer.seek(0) - file_name = f"testgen_{issue["issue_type"]}_issue_report_{issue_id}_{timestamp}.pdf" + file_name = f"testgen_{issue['issue_type']}_issue_report_{issue_id}_{timestamp}.pdf" return file_name, "application/pdf", buffer.read() diff --git a/testgen/ui/views/score_explorer.py b/testgen/ui/views/score_explorer.py index 1e9352ce..4e383cf0 100644 --- a/testgen/ui/views/score_explorer.py +++ b/testgen/ui/views/score_explorer.py @@ -265,7 +265,7 @@ def get_report_file_data(update_progress, issue) -> FILE_DATA_TYPE: update_progress(1.0) buffer.seek(0) - file_name = f"testgen_{issue["issue_type"]}_issue_report_{issue_id}_{timestamp}.pdf" + file_name = f"testgen_{issue['issue_type']}_issue_report_{issue_id}_{timestamp}.pdf" return file_name, "application/pdf", buffer.read() @@ -282,15 +282,15 @@ def dialog_content() -> None: column_filters = 
get_column_filters(project_code) for column in column_filters: - table_group_selected = (f"table_groups_name={column["table_group"]}",) in selected_filters + table_group_selected = (f"table_groups_name={column['table_group']}",) in selected_filters table_selected = ( - f"table_groups_name={column["table_group"]}", - f"table_name={column["table"]}", + f"table_groups_name={column['table_group']}", + f"table_name={column['table']}", ) in selected_filters column_selected = ( - f"table_groups_name={column["table_group"]}", - f"table_name={column["table"]}", - f"column_name={column["name"]}", + f"table_groups_name={column['table_group']}", + f"table_name={column['table']}", + f"column_name={column['name']}", ) in selected_filters column["selected"] = table_group_selected or table_selected or column_selected diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index ff8a1188..66c2aec0 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -811,7 +811,11 @@ def render_binary_chart(data: pd.DataFrame, **params: dict) -> None: history["test_start"] = history["test_date"].apply(datetime.fromisoformat) history["test_end"] = history["test_start"].apply(lambda start: start + timedelta(seconds=60)) history["formatted_test_date"] = history["test_date"].apply(lambda date_str: datetime.fromisoformat(date_str).strftime("%I:%M:%S %p, %d/%m/%Y")) - history["result_measure_with_status"] = history.apply(lambda row: f"{legend_labels[str(int(row['result_measure'])) if not pd.isnull(row['result_measure']) else "0"]} ({row['result_status']})", axis=1) + def _format_measure_with_status(row): + measure_key = str(int(row["result_measure"])) if not pd.isnull(row["result_measure"]) else "0" + return f"{legend_labels[measure_key]} ({row['result_status']})" + + history["result_measure_with_status"] = history.apply(_format_measure_with_status, axis=1) fig = px.timeline( history,