diff --git a/parser/nullable.py b/parser/nullable.py
new file mode 100644
index 0000000..41c411c
--- /dev/null
+++ b/parser/nullable.py
@@ -0,0 +1,87 @@
+"""Extract per-parameter nullability from the MEOS C Doxygen as the SoT.
+
+A MEOS function parameter accepts NULL iff its Doxygen ``@param`` line says so,
+e.g. ``@param[in] srs Spatial reference system, may be `NULL```. This is the
+single source of truth the codegens consume — grounded in the C code, keyed by
+parameter name, and cross-checked in MobilityDB against the PG layer (a SQL
+function declared without ``STRICT`` + the wrapper's ``PG_ARGISNULL`` guards).
+
+The extractor walks the MEOS sources, pairs each Doxygen block with the function
+it documents, and records the params whose description carries a NULL note. The
+result feeds ``shape.nullable`` in the IDL so every binding can guard the param.
+"""
+from __future__ import annotations
+
+import glob
+import re
+from pathlib import Path
+
+# Doxygen block immediately followed by a function definition (``name(...) {``).
+# ``doc`` is tempered so it can never run past its own ``*/``: a block that
+# documents a macro or struct must not lend its ``@param`` lines to whichever
+# function happens to follow later in the file.
+_FUNC = re.compile(
+    r'/\*\*(?P<doc>(?:(?!\*/).)*)\*/\s*\n'
+    r'(?:[A-Za-z_][\w\s\*]*?\n)?'  # optional return-type line
+    r'(?P<name>[a-z][a-z0-9_]*)\s*\('
+    r'(?P<args>[^;{]*?)\)\s*\{',
+    re.S)
+# One @param entry: capture the (possibly comma-separated) names + description.
+_PARAM = re.compile(
+    r'@param\[[^\]]*\]\s+(?P<names>\w+(?:\s*,\s*\w+)*)\s+(?P<desc>.*?)'
+    r'(?=\n\s*\*\s*@|\*/|\Z)', re.S)
+# Phrases that mark a parameter as NULL-tolerant. ``\b`` stops ``or NULL`` from
+# firing inside a longer word, as in ``for NULL``.
+_NULLISH = re.compile(r'may be\s+`?NULL`?|can be\s+`?NULL`?|`?NULL`?\s+is allowed'
+                      r'|\bor\s+`?NULL`?', re.I)
+
+
+def extract_nullable(meos_root: str | Path) -> dict[str, list[str]]:
+    """Return ``{function: [nullable params]}`` from the MEOS C sources.
+
+    Scans ``src/**/*.c`` and ``include/**/*.h`` under ``meos_root``. A function
+    is listed only when at least one of its ``@param`` descriptions carries a
+    NULL note; its parameters keep first-seen order, without duplicates.
+    Files are visited in sorted order so that the result does not depend on
+    the order in which the filesystem lists them.
+    """
+    root = Path(meos_root)
+    files = glob.glob(str(root / "src/**/*.c"), recursive=True)
+    files += glob.glob(str(root / "include/**/*.h"), recursive=True)
+    out: dict[str, list[str]] = {}
+    for f in sorted(files):
+        # Decode as UTF-8 explicitly instead of relying on the locale default;
+        # undecodable bytes are dropped rather than aborting the scan.
+        txt = Path(f).read_text(encoding="utf-8", errors="ignore")
+        for m in _FUNC.finditer(txt):
+            name = m.group("name")
+            for pm in _PARAM.finditer(m.group("doc")):
+                if not _NULLISH.search(pm.group("desc")):
+                    continue
+                seen = out.setdefault(name, [])
+                for p in (n.strip() for n in pm.group("names").split(",")):
+                    if p and p not in seen:
+                        seen.append(p)
+    return out
+
+
+def merge_nullable(idl: dict, meos_root: str | Path) -> tuple[dict, int]:
+    """Fold the extracted nullability into each function's ``shape.nullable``.
+
+    Only names that are actual parameters of the IDL function are kept, so a
+    Doxygen entry for a parameter the IDL does not have is ignored. Returns
+    ``(idl, n)``: the IDL, updated in place, and the number of parameters
+    marked nullable.
+    """
+    nul = extract_nullable(meos_root)
+    n = 0
+    for func in idl["functions"]:
+        params = nul.get(func["name"])
+        if not params:
+            continue
+        present = {p["name"] for p in func.get("params", [])}
+        keep = [p for p in params if p in present]
+        if keep:
+            func.setdefault("shape", {})["nullable"] = keep
+            n += len(keep)
+    return idl, n
diff --git a/parser/shapeinfer.py b/parser/shapeinfer.py
new file mode 100644
index 0000000..598372c
--- /dev/null
+++ b/parser/shapeinfer.py
@@ -0,0 +1,72 @@
+"""Infer per-function output-array *shape* from the C signatures.
+
+MEOS array-returning functions follow one fixed convention, so the shape the
+codegens need is fully derivable from the headers — no hand-maintained table:
+
+    TYPE *f(..., int *count)                 -> returns an array of ``count``
+    TYPE **f(..., TYPE **extra, int *count)  -> primary array return PLUS one
+                                                or more parallel out-arrays
+
+The output length is always passed *by pointer* (``int *count``); an *input*
+array instead carries its length *by value* (``int count``). That pointer/value
+distinction is exactly how a written-back out-array is told apart from a
+read-only in-array — e.g. ``temporal_time_split(..., TimestampTz **time_bins,
+int *count)`` (out) versus ``tsequence_make(const TInstant **instants, int
+count, ...)`` (in).
+
+This replaces the ``meta/meos-meta.json`` shape entries, which had drifted to a
+3-function stub and silently mis-classified every out-array as an input
+parameter, breaking the split / space-split / mvtgeom / normalize families in
+every binding generated from the IDL.
+"""
+from __future__ import annotations
+
+# Parameters that accept NULL by MEOS convention regardless of the function.
+# ``srs`` is the optional spatial-reference string of every ``*_as_*json`` /
+# text output function — passing NULL means "no CRS". Nullability is otherwise
+# semantic (not signature-derivable), so this stays a narrow, named convention
+# rather than a blanket rule; extend only when a binding's tests prove a param
+# is passed None.
+_NULLABLE_BY_CONVENTION = {"srs"}
+
+
+def _out_count_param(func: dict) -> str | None:
+    """Return the name of the by-pointer output count param, if the function
+    has one. This is the marker that the function returns array(s)."""
+    for p in func.get("params", []):
+        ctype = "".join(p.get("cType", "").split())  # ``int *`` == ``int*``
+        if p["name"] == "count" and ctype == "int*":
+            return p["name"]
+    return None
+
+
+def _is_written_back_array(p: dict) -> bool:
+    """A non-const double (or higher) pointer parameter the callee allocates
+    and writes back, i.e. a parallel output array."""
+    ct = " ".join(p.get("cType", "").split())  # collapse whitespace runs
+    return "**" in ct.replace(" ", "") and not ct.startswith("const ")
+
+
+def infer_shapes(idl: dict) -> tuple[dict, dict]:
+    """Populate ``func['shape']`` with ``arrayReturn``/``outputArrays`` derived
+    from the signatures. Returns ``(idl, stats)``. Idempotent and additive: a
+    function with nothing to record is untouched (no empty ``shape`` added)."""
+    n_arr = n_oa = 0
+    for func in idl["functions"]:
+        count = _out_count_param(func)
+        if not count:
+            continue  # not array-returning; nothing to infer
+        # The primary pointer return takes its length from the output count.
+        ret = func.get("returnType", {}).get("c", "")
+        if ret.rstrip().endswith("*"):
+            shape = func.setdefault("shape", {})
+            shape.setdefault("arrayReturn", {})["lengthFrom"] = {
+                "kind": "param", "name": count}
+            n_arr += 1
+        # Parallel written-back out-arrays (``TYPE **extra`` alongside count).
+        out = [{"param": p["name"]} for p in func["params"]
+               if p["name"] != count and _is_written_back_array(p)]
+        if out:
+            func.setdefault("shape", {})["outputArrays"] = out
+            n_oa += len(out)
+    return idl, {"arrayReturn": n_arr, "outputArrays": n_oa}
diff --git a/run.py b/run.py
index 8b505dd..b2bc228 100644
--- a/run.py
+++ b/run.py
@@ -4,6 +4,8 @@
 
 from parser.parser import parse_all_headers, merge_meta
 from parser.portable import attach_portable_aliases
+from parser.shapeinfer import infer_shapes
+from parser.nullable import merge_nullable
 
 HEADERS_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./meos/include")
 
@@ -19,6 +21,16 @@ def main():
     print(f"[1/3] Parsing {HEADERS_DIR}...", file=sys.stderr)
     idl = parse_all_headers(HEADERS_DIR)
 
+    # 1b. Generate the codegen `shape` from the signatures + Doxygen, replacing
+    # the hand-maintained meta stub. outputArrays/arrayReturn come from the
+    # parameter forms; nullable comes from the C `@param ... may be NULL` SoT.
+    idl, sh = infer_shapes(idl)
+    print(f" inferred shape: {sh['arrayReturn']} array returns, "
+          f"{sh['outputArrays']} output arrays", file=sys.stderr)
+    idl, nn = merge_nullable(idl, HEADERS_DIR.parent)
+    print(f" nullable params from Doxygen `may be NULL`: {nn}",
+          file=sys.stderr)
+
     # 2. Merge with manual metadata
     if META_PATH.exists():
         print(f"[2/3] Merging with {META_PATH}...", file=sys.stderr)
diff --git a/tests/test_nullable.py b/tests/test_nullable.py
new file mode 100644
index 0000000..0d37bc4
--- /dev/null
+++ b/tests/test_nullable.py
@@ -0,0 +1,79 @@
+"""Regression tests for parser/nullable.py.
+
+Nullability is read from the C Doxygen `@param ... may be NULL` notes (the
+source of truth) and folded into each function's ``shape.nullable`` for the
+params that actually exist on the IDL function.
+
+Plain unittest, no pytest dependency; writes a tiny synthetic source tree.
+"""
+import tempfile
+import unittest
+from pathlib import Path
+
+from parser.nullable import extract_nullable, merge_nullable
+
+# Two documented C functions; each has exactly one `may be NULL` parameter.
+SAMPLE = '''
+/**
+ * @ingroup meos_temporal_inout
+ * @brief Return the MF-JSON representation
+ * @param[in] temp Temporal value
+ * @param[in] srs Spatial reference system, may be `NULL`
+ */
+char *
+temporal_as_mfjson(const Temporal *temp, char *srs)
+{
+  return NULL;
+}
+
+/**
+ * @brief Append an instant
+ * @param[in] inst Instant
+ * @param[in] maxt Maximum time interval, may be `NULL`
+ * @param[in] interp Interpolation
+ */
+Temporal *
+temporal_append_tinstant(const TInstant *inst, const Interval *maxt, int interp)
+{
+  return NULL;
+}
+'''
+
+
+class NullableTests(unittest.TestCase):
+    """Run the extractor and the IDL merge over a throwaway source tree."""
+
+    def setUp(self):
+        """Create ``<tmp>/src/sample.c`` and an empty ``<tmp>/include``."""
+        self.tmp = tempfile.TemporaryDirectory()
+        src = Path(self.tmp.name) / "src"
+        src.mkdir()
+        (src / "sample.c").write_text(SAMPLE)
+        (Path(self.tmp.name) / "include").mkdir()
+
+    def tearDown(self):
+        self.tmp.cleanup()
+
+    def test_extracts_only_may_be_null_params(self):
+        nul = extract_nullable(self.tmp.name)
+        self.assertEqual(nul["temporal_as_mfjson"], ["srs"])
+        self.assertEqual(nul["temporal_append_tinstant"], ["maxt"])
+        # `temp`, `inst`, `interp` carry no NULL note -> not nullable
+        self.assertNotIn("temp", nul["temporal_as_mfjson"])
+
+    def test_merge_only_existing_params(self):
+        idl = {"functions": [
+            {"name": "temporal_as_mfjson",
+             "params": [{"name": "temp"}, {"name": "srs"}]},
+            # function whose nullable param is NOT in its IDL signature
+            {"name": "temporal_append_tinstant", "params": [{"name": "inst"}]},
+        ]}
+        idl, n = merge_nullable(idl, self.tmp.name)
+        self.assertEqual(idl["functions"][0]["shape"]["nullable"], ["srs"])
+        # maxt absent from the IDL params -> not added
+        self.assertNotIn("shape", idl["functions"][1])
+        self.assertEqual(n, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_shapeinfer.py b/tests/test_shapeinfer.py
new file mode 100644
index 0000000..592cd41
--- /dev/null
+++ b/tests/test_shapeinfer.py
@@ -0,0 +1,74 @@
+"""Regression tests for parser/shapeinfer.py.
+
+The inferer derives array-output shape from the C signatures, replacing the
+hand-maintained meta stub. The discriminator is the *count* parameter's form:
+
+* a written-back out-array pairs with a by-pointer ``int *count`` (the callee
+  fills the length) -> ``outputArrays`` + ``arrayReturn.lengthFrom``
+* a read-only in-array pairs with a by-value ``int count`` -> left untouched
+
+Plain unittest, no pytest dependency; fully synthetic IDL (no generated file).
+"""
+import unittest
+
+from parser.shapeinfer import infer_shapes
+
+
+def _fn(name, ret, params):
+    """Build a minimal IDL function entry from ``(name, cType)`` pairs."""
+    return {"name": name,
+            "returnType": {"c": ret, "canonical": ret},
+            "params": [{"name": n, "cType": t, "canonical": t} for n, t in params]}
+
+
+class ShapeInferTests(unittest.TestCase):
+    """Only functions with a by-pointer ``int *count`` may gain a ``shape``."""
+
+    def test_output_array_with_pointer_count(self):
+        # temporal_time_split-style: non-const ** out-array + by-pointer count
+        idl = {"functions": [_fn(
+            "temporal_time_split", "Temporal **",
+            [("temp", "const Temporal *"), ("duration", "const Interval *"),
+             ("torigin", "TimestampTz"), ("time_bins", "TimestampTz **"),
+             ("count", "int *")])]}
+        idl, stats = infer_shapes(idl)
+        sh = idl["functions"][0]["shape"]
+        self.assertEqual(sh["outputArrays"], [{"param": "time_bins"}])
+        self.assertEqual(sh["arrayReturn"]["lengthFrom"],
+                         {"kind": "param", "name": "count"})
+        self.assertEqual(stats["outputArrays"], 1)
+
+    def test_two_parallel_output_arrays(self):
+        # every non-const ** parameter is reported, in declaration order
+        idl = {"functions": [_fn(
+            "tfloat_value_time_split", "Temporal **",
+            [("temp", "const Temporal *"), ("vsize", "double"),
+             ("value_bins", "double **"), ("time_bins", "TimestampTz **"),
+             ("count", "int *")])]}
+        idl, _ = infer_shapes(idl)
+        self.assertEqual(idl["functions"][0]["shape"]["outputArrays"],
+                         [{"param": "value_bins"}, {"param": "time_bins"}])
+
+    def test_input_array_with_value_count_untouched(self):
+        # tsequence_make-style: ** input array carries its length BY VALUE
+        idl = {"functions": [_fn(
+            "tsequence_make", "TSequence *",
+            [("instants", "const TInstant **"), ("count", "int"),
+             ("lower_inc", "bool")])]}
+        idl, stats = infer_shapes(idl)
+        self.assertNotIn("shape", idl["functions"][0])
+        self.assertEqual(stats["outputArrays"], 0)
+
+    def test_nonconst_input_array_with_value_count_untouched(self):
+        # tsequenceset_make_gaps-style: non-const ** but BY-VALUE count => input
+        idl = {"functions": [_fn(
+            "tsequenceset_make_gaps", "TSequenceSet *",
+            [("instants", "TInstant **"), ("count", "int"),
+             ("maxt", "const Interval *")])]}
+        idl, stats = infer_shapes(idl)
+        self.assertEqual(stats["outputArrays"], 0)
+        self.assertNotIn("shape", idl["functions"][0])
+
+
+if __name__ == "__main__":
+    unittest.main()