diff --git a/parser/typerecover.py b/parser/typerecover.py
new file mode 100644
index 0000000..01d8153
--- /dev/null
+++ b/parser/typerecover.py
@@ -0,0 +1,163 @@
+"""Recover scalar/pointer C types that parsing collapsed to ``int``.
+
+Two distinct mechanisms erase a PG-vendored type name before the AST is built,
+leaving the IDL spelling as ``int`` / ``int *`` / ``int **``:
+
+* The host-symbol-collision build prefix-renames PG types, so ``bool`` /
+  ``int64`` / ``Timestamp`` / ``TimestampTz`` / ``H3Index`` reach libclang
+  already macro-collapsed.
+* ``text`` (a PG ``varlena``) is undeclared to libclang — there is no
+  ``pg_config.h`` / ``c.h`` in the parse — so C's implicit-int rule turns
+  ``text`` / ``text *`` / ``text **`` into ``int`` / ``int *`` / ``int **``.
+
+Either way the real type name survives in the raw header declaration TEXT, so
+this post-parse pass recovers it and rewrites the IDL entry, **preserving the
+declaration's ``const`` qualifier and pointer depth**. It is idempotent and a
+no-op on correctly-parsed headers: a slot is only rewritten when its current
+IDL spelling is ``int`` with the *same* const/pointer shape the header would
+collapse to, and the header declaration spells a recoverable base type.
+Genuinely-int functions (e.g. ``intspan_width`` returning ``int``, or
+``tint_values`` returning ``int *``) are left untouched because ``int`` is not
+a recoverable base name.
+
+Recovered spellings drive the downstream binding generators (JMEOS maps
+``int64_t`` / ``uint64_t`` -> ``long`` and ``bool`` -> ``boolean``; MEOS.js
+maps ``text *`` to a JS string via cstring2text / text2cstring; ...).
+"""
+import re
+import glob
+from pathlib import Path
+
+# Recoverable header base type -> base spelling written into the IDL.
+_TYPE_MAP = {
+    "bool": "bool",
+    "int64": "int64_t",
+    "Timestamp": "Timestamp",
+    "TimestampTz": "TimestampTz",
+    "H3Index": "uint64_t",
+    "text": "text",
+    "GSERIALIZED": "GSERIALIZED",
+    "Interval": "Interval",
+    "DateADT": "DateADT",
+    "Datum": "Datum",
+    "size_t": "size_t",
+    "GBOX": "GBOX",
+    "BOX3D": "BOX3D",
+    "AFFINE": "AFFINE",
+}
+
+_NAMES = "|".join(sorted(_TYPE_MAP, key=len, reverse=True))
+# optional const, a whole-word recoverable base, optional stars, optional identifier
+_DECL_RE = re.compile(
+    rf"^(?:(?P<const>const)\s+)?(?P<base>{_NAMES})\b\s*(?P<stars>\**)\s*\w*$"
+)
+
+
+def _nospace(t):
+    """Return ``t`` with all whitespace removed (``None`` becomes ``""``)."""
+    return re.sub(r"\s+", "", t or "")
+
+
+def _recovery(fragment):
+    """Return ``(collapsed_idl_type, recovered_idl_type)`` for a declaration
+    fragment, or ``None`` when its base type is not recoverable.
+
+        'const text *txt'  -> ('const int *', 'const text *')
+        'int64'            -> ('int', 'int64_t')
+        'TimestampTz *'    -> ('int *', 'TimestampTz *')
+        'text **values'    -> ('int **', 'text **')
+        'int *count'       -> None   (genuine int)
+    """
+    m = _DECL_RE.match(fragment.strip())
+    if not m:
+        return None
+    const = "const " if m.group("const") else ""
+    stars = m.group("stars") or ""
+    suffix = (" " + stars) if stars else ""
+    collapsed = f"{const}int{suffix}"
+    recovered = f"{const}{_TYPE_MAP[m.group('base')]}{suffix}"
+    return collapsed, recovered
+
+
+def _parse_header_decls(headers_dir):
+    """name -> (ret_recovery, [param_recovery, ...]) from the header text,
+    where each recovery is a ``(collapsed, recovered)`` pair or ``None``."""
+    decls = {}
+    pattern = str(Path(headers_dir) / "**" / "*.h")
+    for path in glob.glob(pattern, recursive=True):
+        # read_text() closes the file handle; block and line comments are
+        # blanked so commented text cannot be mistaken for a declaration.
+        raw = Path(path).read_text(errors="ignore")
+        txt = re.sub(r"/\*.*?\*/|//[^\n]*", " ", raw, flags=re.S)
+        for m in re.finditer(r"extern\s+(.+?);", txt, re.S):
+            d = re.sub(r"\s+", " ", m.group(1)).strip()
+            fm = re.match(r"(?P<ret>.+?)\b(?P<name>\w+)\s*\((?P<params>.*)\)$", d)
+            if not fm:
+                continue
+            # split params on top-level commas
+            params, depth, cur = [], 0, ""
+            for ch in fm.group("params"):
+                if ch == "(":
+                    depth += 1
+                elif ch == ")":
+                    depth -= 1
+                if ch == "," and depth == 0:
+                    params.append(cur)
+                    cur = ""
+                else:
+                    cur += ch
+            if cur.strip():
+                params.append(cur)
+            decls[fm.group("name")] = (
+                _recovery(fm.group("ret")),
+                [_recovery(p) for p in params if p.strip()],
+            )
+    return decls
+
+
+def recover_collapsed_types(idl, headers_dir):
+    """Rewrite IDL function types that collapsed to int, from header text.
+
+    Returns ``(idl, stats)`` where stats counts the rewrites performed.
+    """
+    decls = _parse_header_decls(headers_dir)
+    fixed = {"returns": 0, "params": 0}
+
+    def _apply(slot, recovery):
+        """Rewrite a return/param slot in place; return 1 if rewritten."""
+        if not (recovery and isinstance(slot, dict)):
+            return 0
+        collapsed, recovered = recovery
+        key = "c" if "c" in slot else "cType"
+        if _nospace(slot.get(key)) != _nospace(collapsed):
+            return 0
+        slot[key] = recovered
+        if _nospace(slot.get("canonical")) == _nospace(collapsed):
+            slot["canonical"] = recovered
+        return 1
+
+    def patch(fn):
+        """Patch one function entry; params only when the arity matches."""
+        rec = decls.get(fn.get("name"))
+        if not rec:
+            return
+        ret_rec, param_recs = rec
+        fixed["returns"] += _apply(fn.get("returnType"), ret_rec)
+        params = fn.get("params") or []
+        if len(params) == len(param_recs):
+            for p, pr in zip(params, param_recs):
+                fixed["params"] += _apply(p, pr)
+
+    def walk(o):
+        """Recurse through dicts/lists, patching every function-shaped dict."""
+        if isinstance(o, dict):
+            if "name" in o and ("returnType" in o or "params" in o):
+                patch(o)
+            for v in o.values():
+                walk(v)
+        elif isinstance(o, list):
+            for v in o:
+                walk(v)
+
+    walk(idl)
+    return idl, fixed
diff --git a/run.py b/run.py
index 8b505dd..640dd9e 100644
--- a/run.py
+++ b/run.py
@@ -4,6 +4,7 @@
 
 from parser.parser import parse_all_headers, merge_meta
 from parser.portable import attach_portable_aliases
+from parser.typerecover import recover_collapsed_types
 
 HEADERS_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./meos/include")
 
@@ -19,6 +20,14 @@ def main():
     print(f"[1/3] Parsing {HEADERS_DIR}...", file=sys.stderr)
     idl = parse_all_headers(HEADERS_DIR)
 
+    # 1b. Recover PG-vendored C types that parsing collapsed to int
+    #     (bool / int64 / Timestamp(Tz) / H3Index / text / ...) from the header text.
+    #     No-op when the headers parse those types correctly.
+    idl, rec = recover_collapsed_types(idl, HEADERS_DIR)
+    if rec["returns"] or rec["params"]:
+        print(f" recovered {rec['returns']} return types, "
+              f"{rec['params']} params from collapsed int", file=sys.stderr)
+
     # 2. Merge with manual metadata
     if META_PATH.exists():
         print(f"[2/3] Merging with {META_PATH}...", file=sys.stderr)
diff --git a/tests/test_typerecover.py b/tests/test_typerecover.py
new file mode 100644
index 0000000..ed10993
--- /dev/null
+++ b/tests/test_typerecover.py
@@ -0,0 +1,118 @@
+"""Regression tests for parser/typerecover.py.
+
+The recoverer rewrites IDL types that parsing collapsed to ``int`` /
+``int *`` / ``int **`` back to the real type spelled in the header text,
+preserving ``const`` and pointer depth. Two collapse mechanisms are covered:
+
+* host-symbol-collision build: bool / int64 / Timestamp / TimestampTz / H3Index
+* undeclared ``text`` (a PG varlena): no pg_config.h in the parse, so the
+  implicit-int rule turns text / text * / text ** into int / int * / int **,
+  silently mistyping the IDL that every downstream binding (PyMEOS-CFFI, GoMEOS,
+  MEOS.NET, JMEOS, MEOS.js) consumes.
+
+These assert the recovered shapes survive and that genuinely-int functions are
+left untouched. Plain unittest, no pytest dependency.
+
+The IDL is generated, not committed; run ``python run.py`` first.
+
+Schema note: a function's ``returnType`` is a ``{"c", "canonical"}`` dict and a
+parameter is a ``{"name", "cType", "canonical"}`` dict.
+"""
+import json
+import unittest
+from pathlib import Path
+
+IDL = Path(__file__).resolve().parents[1] / "output" / "meos-idl.json"
+
+
+class TypeRecoverTests(unittest.TestCase):
+    def setUp(self):
+        if not IDL.exists():
+            self.skipTest(f"{IDL} not generated; run `python run.py` first")
+        idl = json.loads(IDL.read_text())
+        self.by_name = {f["name"]: f for f in idl["functions"]}
+
+    def _ret(self, name):
+        self.assertIn(name, self.by_name, f"{name} missing from IDL")
+        return self.by_name[name]["returnType"]["c"]
+
+    def _param_ctypes(self, name):
+        self.assertIn(name, self.by_name, f"{name} missing from IDL")
+        return [p["cType"] for p in self.by_name[name]["params"]]
+
+    # ---- text (the undeclared-varlena collapse) ----------------------------
+
+    def test_text_pointer_returns_recovered(self):
+        # Pre-fix these came back as ``int *``.
+        for name in ("cstring2text", "ttext_start_value", "text_copy",
+                     "text_upper", "textset_end_value"):
+            self.assertEqual(self._ret(name), "text *", name)
+
+    def test_text_const_pointer_params_recovered(self):
+        # ``const text *`` collapses to ``const int *``.
+        self.assertIn("const text *", self._param_ctypes("text2cstring"))
+        self.assertIn("const text *", self._param_ctypes("textcat_ttext_text"))
+
+    def test_text_double_pointer_recovered(self):
+        # ``text **`` collapses to ``int **``.
+        self.assertIn("text **", self._param_ctypes("textset_make"))
+
+    def test_no_text_left_collapsed_to_int(self):
+        # Hard guard: a healthy IDL carries many text* slots. 0 means the
+        # recoverer (or its text coverage) regressed.
+        text_fns = [f for f in self.by_name.values()
+                    if "text *" in json.dumps(f)]
+        self.assertGreater(len(text_fns), 50,
+                           "text* collapsed toward int — typerecover regression?")
+
+    # ---- GSERIALIZED (the opaque PG geometry, collapses to int) ------------
+
+    def test_gserialized_returns_recovered(self):
+        # Pre-fix these geo-returning functions came back as ``int *``.
+        for name in ("tcbuffer_convex_hull", "tcbuffer_traversed_area",
+                     "geo_round"):
+            self.assertEqual(self._ret(name), "GSERIALIZED *", name)
+
+    def test_no_gserialized_left_collapsed_to_int(self):
+        # Hard guard: a healthy IDL carries many GSERIALIZED* slots (geo
+        # accessors/constructors). 0 means GSERIALIZED recovery regressed.
+        geo_fns = [f for f in self.by_name.values()
+                   if "GSERIALIZED *" in json.dumps(f)]
+        self.assertGreater(len(geo_fns), 50,
+                           "GSERIALIZED* collapsed toward int — typerecover regression?")
+
+    # ---- other PG-vendored opaque types (Interval / DateADT / Datum / ...) -
+
+    def test_interval_params_recovered(self):
+        # ``const Interval *`` collapses to ``const int *`` (e.g. duration args).
+        self.assertIn("const Interval *", self._param_ctypes("temporal_tprecision"))
+        self.assertIn("const Interval *", self._param_ctypes("temporal_tsample"))
+
+    def test_other_vendored_pointer_types_recovered(self):
+        # Hard guard: each PG-vendored opaque type carries many pointer slots in
+        # a healthy IDL; 0 means that type's recovery regressed.
+        for typ, floor in (("Interval *", 30), ("DateADT", 20),
+                           ("GBOX *", 3), ("BOX3D *", 3)):
+            hits = [f for f in self.by_name.values() if typ in json.dumps(f)]
+            self.assertGreater(len(hits), floor,
+                               f"{typ} collapsed toward int — typerecover regression?")
+
+    # ---- the host-symbol-collision collapses (incl. pointer returns) -------
+
+    def test_bool_and_pointer_returns_recovered(self):
+        self.assertEqual(self._ret("temporal_eq"), "bool")  # scalar
+        self.assertEqual(self._ret("tbool_values"), "bool *")  # pointer return
+        self.assertEqual(self._ret("temporal_timestamps"), "TimestampTz *")
+        self.assertEqual(self._ret("bigintset_values"), "int64_t *")
+        self.assertEqual(self._ret("th3index_values"), "uint64_t *")
+
+    # ---- genuine-int controls (must NOT be rewritten) ----------------------
+
+    def test_genuine_int_left_untouched(self):
+        # ``int`` is not a recoverable base name.
+        self.assertEqual(self._ret("intspan_width"), "int")  # genuine scalar int
+        self.assertEqual(self._ret("tint_values"), "int *")  # genuine int array
+
+
+if __name__ == "__main__":
+    unittest.main()