Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 157 additions & 0 deletions parser/typerecover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
"""Recover scalar/pointer C types that parsing collapsed to ``int``.

Two distinct mechanisms erase a PG-vendored type name before the AST is built,
leaving the IDL spelling as ``int`` / ``int *`` / ``int **``:

* The host-symbol-collision build prefix-renames PG types, so ``bool`` /
``int64`` / ``Timestamp`` / ``TimestampTz`` / ``H3Index`` reach libclang
already macro-collapsed.
* ``text`` (a PG ``varlena``) is undeclared to libclang — there is no
``pg_config.h`` / ``c.h`` in the parse — so C's implicit-int rule turns
``text`` / ``text *`` / ``text **`` into ``int`` / ``int *`` / ``int **``.

Either way the real type name survives in the raw header declaration TEXT, so
this post-parse pass recovers it and rewrites the IDL entry, **preserving the
declaration's ``const`` qualifier and pointer depth**. It is idempotent and a
no-op on correctly-parsed headers: a slot is only rewritten when its current
IDL spelling is ``int`` with the *same* const/pointer shape the header would
collapse to, and the header declaration spells a recoverable base type.
Genuinely-int functions (e.g. ``intspan_width`` returning ``int``, or
``tint_values`` returning ``int *``) are left untouched because ``int`` is not
a recoverable base name.

Recovered spellings drive the downstream binding generators (JMEOS maps
``int64_t`` / ``uint64_t`` -> ``long`` and ``bool`` -> ``boolean``; MEOS.js
maps ``text *`` to a JS string via cstring2text / text2cstring; ...).
"""
import re
import glob
from pathlib import Path

# Recoverable header base type -> base spelling written into the IDL.
# Keys are the type names as they are spelled in the header text; values are
# the spelling the IDL slot is rewritten to (``int64`` and ``H3Index`` are
# written as ``int64_t`` / ``uint64_t``, every other name is kept verbatim).
_TYPE_MAP = {
    "bool": "bool",
    "int64": "int64_t",
    "Timestamp": "Timestamp",
    "TimestampTz": "TimestampTz",
    "H3Index": "uint64_t",
    "text": "text",
    "GSERIALIZED": "GSERIALIZED",
    "Interval": "Interval",
    "DateADT": "DateADT",
    "Datum": "Datum",
    "size_t": "size_t",
    "GBOX": "GBOX",
    "BOX3D": "BOX3D",
    "AFFINE": "AFFINE",
}

# Regex alternation of the recoverable names, escaped and longest first so
# that ``TimestampTz`` is tried before its prefix ``Timestamp``.
_NAMES = "|".join(
    re.escape(name) for name in sorted(_TYPE_MAP, key=len, reverse=True)
)
# optional const, a recoverable base, optional pointer stars, optional identifier.
# The ``\b`` right after the base forces a whole-word match: without it a
# longer identifier that merely *starts* with a recoverable name (``boolean``,
# ``textsize``) would be read as the base type followed by a parameter name.
_DECL_RE = re.compile(
    rf"^(?:(?P<const>const)\s+)?(?P<base>{_NAMES})\b\s*(?P<stars>\**)\s*\w*$"
)


def _nospace(t):
    """Return *t* with every run of whitespace removed.

    A falsy *t* (``None`` or ``""``) yields the empty string, so two type
    spellings can be compared regardless of how they are spaced.
    """
    text = t if t else ""
    return re.compile(r"\s+").sub("", text)


def _recovery(fragment):
    """Map one declaration fragment to its collapsed and recovered spellings.

    *fragment* is a return type or a single parameter as written in the
    header. The result is ``(collapsed_idl_type, recovered_idl_type)``, or
    ``None`` when the fragment does not spell a recoverable base type.
    The ``const`` qualifier and the pointer depth are carried over to both
    spellings unchanged; any trailing identifier is dropped.

        'const text *txt'  -> ('const int *', 'const text *')
        'int64'            -> ('int', 'int64_t')
        'TimestampTz *'    -> ('int *', 'TimestampTz *')
        'text **values'    -> ('int **', 'text **')
        'int *count'       -> None   (genuine int)
    """
    match = _DECL_RE.match(fragment.strip())
    if match is None:
        return None

    qualifier = "const " if match.group("const") else ""
    pointer = match.group("stars") or ""
    # Pointer stars are separated from the base by exactly one space.
    tail = f" {pointer}" if pointer else ""
    base = _TYPE_MAP[match.group("base")]
    return f"{qualifier}int{tail}", f"{qualifier}{base}{tail}"


def _split_top_level_params(params):
    """Split a parameter list on commas that are not nested in parentheses.

    Blank pieces are dropped, so an empty list yields ``[]``.
    """
    pieces, depth, start = [], 0, 0
    for index, ch in enumerate(params):
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        elif ch == "," and depth == 0:
            pieces.append(params[start:index])
            start = index + 1
    pieces.append(params[start:])
    return [piece for piece in pieces if piece.strip()]


def _parse_header_decls(headers_dir):
    """Collect the recoverable types of every ``extern`` function declaration.

    Every ``*.h`` file under *headers_dir* (searched recursively) is scanned
    as plain text; no C parser is involved.

    Returns a dict ``name -> (ret_recovery, [param_recovery, ...])`` where
    each recovery is the ``(collapsed, recovered)`` pair produced by
    ``_recovery`` or ``None``. When the same function name is declared more
    than once, the declaration read last wins; headers are read in sorted
    path order so the outcome does not depend on filesystem enumeration order.
    """
    decls = {}
    pattern = str(Path(headers_dir) / "**" / "*.h")
    for path in sorted(glob.glob(pattern, recursive=True)):
        # Context manager so the handle is closed even if reading fails.
        with open(path, encoding="utf-8", errors="ignore") as handle:
            txt = handle.read()
        # Drop comments before looking for declarations: a commented-out
        # ``extern`` must not be harvested, and a comment that merely contains
        # the word "extern" must not swallow the declaration that follows it.
        # A block comment becomes a space so adjacent tokens stay separated.
        txt = re.sub(r"/\*.*?\*/", " ", txt, flags=re.S)
        txt = re.sub(r"//.*", "", txt)
        for m in re.finditer(r"\bextern\s+(.+?);", txt, re.S):
            # Collapse the (possibly multi-line) declaration onto one line.
            d = re.sub(r"\s+", " ", m.group(1)).strip()
            fm = re.match(r"(?P<ret>.+?)\b(?P<name>\w+)\s*\((?P<params>.*)\)$", d)
            if not fm:
                # Not a function declaration (e.g. an extern variable).
                continue
            params = _split_top_level_params(fm.group("params"))
            decls[fm.group("name")] = (
                _recovery(fm.group("ret")),
                [_recovery(p) for p in params],
            )
    return decls


def recover_collapsed_types(idl, headers_dir):
    """Restore IDL function types that collapsed to ``int``, using header text.

    The IDL is walked recursively. Every dict that has a ``"name"`` key and a
    ``"returnType"`` or ``"params"`` key is treated as a function and matched
    by name against the declarations found under *headers_dir*. Slots are
    rewritten in place.

    Returns ``(idl, stats)`` where *idl* is the same object that was passed in
    and *stats* is ``{"returns": n, "params": m}``, the number of return and
    parameter slots rewritten.
    """
    header_decls = _parse_header_decls(headers_dir)
    stats = {"returns": 0, "params": 0}

    def rewrite_slot(slot, recovery):
        """Rewrite one return/param slot in place; return 1 if rewritten, else 0."""
        if not recovery or not isinstance(slot, dict):
            return 0
        collapsed, recovered = recovery
        wanted = _nospace(collapsed)
        # Return slots keep the C spelling under "c"; otherwise "cType" is used.
        field = "c" if "c" in slot else "cType"
        if _nospace(slot.get(field)) != wanted:
            # The slot is not in the collapsed shape: leave it alone.
            return 0
        slot[field] = recovered
        if _nospace(slot.get("canonical")) == wanted:
            slot["canonical"] = recovered
        return 1

    def patch_function(fn):
        """Apply the header recoveries to one function entry."""
        entry = header_decls.get(fn.get("name"))
        if not entry:
            return
        ret_recovery, param_recoveries = entry
        stats["returns"] += rewrite_slot(fn.get("returnType"), ret_recovery)
        idl_params = fn.get("params") or []
        # Parameters are matched by position, so only trust equal arity.
        if len(idl_params) != len(param_recoveries):
            return
        stats["params"] += sum(
            rewrite_slot(param, recovery)
            for param, recovery in zip(idl_params, param_recoveries)
        )

    def visit(node):
        """Depth-first walk over nested dicts and lists."""
        if isinstance(node, list):
            for child in node:
                visit(child)
            return
        if not isinstance(node, dict):
            return
        if "name" in node and ("returnType" in node or "params" in node):
            patch_function(node)
        for child in node.values():
            visit(child)

    visit(idl)
    return idl, stats
9 changes: 9 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from parser.parser import parse_all_headers, merge_meta
from parser.portable import attach_portable_aliases
from parser.typerecover import recover_collapsed_types


HEADERS_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./meos/include")
Expand All @@ -19,6 +20,14 @@ def main():
print(f"[1/3] Parsing {HEADERS_DIR}...", file=sys.stderr)
idl = parse_all_headers(HEADERS_DIR)

# 1b. Recover PG-vendored C types that parsing collapsed to int
# (bool / int64 / Timestamp(Tz) / H3Index / text / GSERIALIZED / ...)
# from the header text. No-op when the headers parse those types correctly.
idl, rec = recover_collapsed_types(idl, HEADERS_DIR)
if rec["returns"] or rec["params"]:
print(f" recovered {rec['returns']} return types, "
f"{rec['params']} params from collapsed int", file=sys.stderr)

# 2. Merge with manual metadata
if META_PATH.exists():
print(f"[2/3] Merging with {META_PATH}...", file=sys.stderr)
Expand Down
118 changes: 118 additions & 0 deletions tests/test_typerecover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""Regression tests for parser/typerecover.py.

The recoverer rewrites IDL types that parsing collapsed to ``int`` /
``int *`` / ``int **`` back to the real type spelled in the header text,
preserving ``const`` and pointer depth. Two collapse mechanisms are covered:

* host-symbol-collision build: bool / int64 / Timestamp / TimestampTz / H3Index
* undeclared ``text`` (a PG varlena): no pg_config.h in the parse, so the
implicit-int rule turns text / text * / text ** into int / int * / int **,
silently mistyping the IDL that every downstream binding (PyMEOS-CFFI, GoMEOS,
MEOS.NET, JMEOS, MEOS.js) consumes.

These assert the recovered shapes survive and that genuinely-int functions are
left untouched. Plain unittest, no pytest dependency.

The IDL is generated, not committed; run ``python run.py`` first.

Schema note: a function's ``returnType`` is a ``{"c", "canonical"}`` dict and a
parameter is a ``{"name", "cType", "canonical"}`` dict.
"""
import json
import unittest
from pathlib import Path

IDL = Path(__file__).resolve().parents[1] / "output" / "meos-idl.json"


class TypeRecoverTests(unittest.TestCase):
    """Checks on the generated IDL that collapsed types were recovered.

    The IDL file is read and parsed once for the whole class, not once per
    test; the tests only read from the resulting ``by_name`` index. The whole
    class is skipped when the IDL has not been generated.
    """

    @classmethod
    def setUpClass(cls):
        if not IDL.exists():
            raise unittest.SkipTest(
                f"{IDL} not generated; run `python run.py` first")
        idl = json.loads(IDL.read_text(encoding="utf-8"))
        # Function name -> function entry.
        cls.by_name = {f["name"]: f for f in idl["functions"]}

    def _ret(self, name):
        """Return the C spelling of *name*'s return type."""
        self.assertIn(name, self.by_name, f"{name} missing from IDL")
        return self.by_name[name]["returnType"]["c"]

    def _param_ctypes(self, name):
        """Return the C spellings of *name*'s parameters, in order."""
        self.assertIn(name, self.by_name, f"{name} missing from IDL")
        return [p["cType"] for p in self.by_name[name]["params"]]

    def _functions_mentioning(self, spelling):
        """Return the functions whose serialized entry contains *spelling*."""
        return [f for f in self.by_name.values() if spelling in json.dumps(f)]

    # ---- text (the undeclared-varlena collapse) ----------------------------

    def test_text_pointer_returns_recovered(self):
        # Pre-fix these came back as ``int *``.
        for name in ("cstring2text", "ttext_start_value", "text_copy",
                     "text_upper", "textset_end_value"):
            with self.subTest(name=name):
                self.assertEqual(self._ret(name), "text *", name)

    def test_text_const_pointer_params_recovered(self):
        # ``const text *`` collapses to ``const int *``.
        self.assertIn("const text *", self._param_ctypes("text2cstring"))
        self.assertIn("const text *", self._param_ctypes("textcat_ttext_text"))

    def test_text_double_pointer_recovered(self):
        # ``text **`` collapses to ``int **``.
        self.assertIn("text **", self._param_ctypes("textset_make"))

    def test_no_text_left_collapsed_to_int(self):
        # Hard guard: a healthy IDL carries many text* slots. A count at or
        # below the floor means the recoverer (or its text coverage) regressed.
        text_fns = self._functions_mentioning("text *")
        self.assertGreater(len(text_fns), 50,
                           "text* collapsed toward int — typerecover regression?")

    # ---- GSERIALIZED (the opaque PG geometry, collapses to int) ------------

    def test_gserialized_returns_recovered(self):
        # Pre-fix these geo-returning functions came back as ``int *``.
        for name in ("tcbuffer_convex_hull", "tcbuffer_traversed_area",
                     "geo_round"):
            with self.subTest(name=name):
                self.assertEqual(self._ret(name), "GSERIALIZED *", name)

    def test_no_gserialized_left_collapsed_to_int(self):
        # Hard guard: a healthy IDL carries many GSERIALIZED* slots (geo
        # accessors/constructors). A count at or below the floor means
        # GSERIALIZED recovery regressed.
        geo_fns = self._functions_mentioning("GSERIALIZED *")
        self.assertGreater(len(geo_fns), 50,
                           "GSERIALIZED* collapsed toward int — typerecover regression?")

    # ---- other PG-vendored opaque types (Interval / DateADT / Datum / ...) -

    def test_interval_params_recovered(self):
        # ``const Interval *`` collapses to ``const int *`` (e.g. duration args).
        self.assertIn("const Interval *", self._param_ctypes("temporal_tprecision"))
        self.assertIn("const Interval *", self._param_ctypes("temporal_tsample"))

    def test_other_vendored_pointer_types_recovered(self):
        # Hard guard: each PG-vendored opaque type appears in many functions
        # of a healthy IDL; a count at or below its floor means that type's
        # recovery regressed.
        for typ, floor in (("Interval *", 30), ("DateADT", 20),
                           ("GBOX *", 3), ("BOX3D *", 3)):
            with self.subTest(type=typ):
                hits = self._functions_mentioning(typ)
                self.assertGreater(len(hits), floor,
                                   f"{typ} collapsed toward int — typerecover regression?")

    # ---- the host-symbol-collision collapses (incl. pointer returns) -------

    def test_bool_and_pointer_returns_recovered(self):
        self.assertEqual(self._ret("temporal_eq"), "bool")        # scalar
        self.assertEqual(self._ret("tbool_values"), "bool *")     # pointer return
        self.assertEqual(self._ret("temporal_timestamps"), "TimestampTz *")
        self.assertEqual(self._ret("bigintset_values"), "int64_t *")
        self.assertEqual(self._ret("th3index_values"), "uint64_t *")

    # ---- genuine-int controls (must NOT be rewritten) ----------------------

    def test_genuine_int_left_untouched(self):
        # ``int`` is not a recoverable base name.
        self.assertEqual(self._ret("intspan_width"), "int")   # genuine scalar int
        self.assertEqual(self._ret("tint_values"), "int *")   # genuine int array


if __name__ == "__main__":
unittest.main()