Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions parser/nullable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""Extract per-parameter nullability from the MEOS C Doxygen as the SoT.

A MEOS function parameter accepts NULL iff its Doxygen ``@param`` line says so,
e.g. ``@param[in] srs Spatial reference system, may be `NULL```. This is the
single source of truth the codegens consume — grounded in the C code, keyed by
parameter name, and cross-checked in MobilityDB against the PG layer (a SQL
function declared without ``STRICT`` + the wrapper's ``PG_ARGISNULL`` guards).

The extractor walks the MEOS sources, pairs each Doxygen block with the function
it documents, and records the params whose description carries a NULL note. The
result feeds ``shape.nullable`` in the IDL so every binding can guard the param.
"""
from __future__ import annotations

import glob
import re
from pathlib import Path

# Doxygen block immediately followed by a function definition (``name(...) {``).
# The doc body uses a tempered dot so it can never run past its own closing
# ``*/``.  A plain lazy ``.*?`` (with re.S) would, on backtracking, swallow an
# earlier comment that is NOT followed by a matching definition (file header,
# macro or prototype doc) together with the comment that really precedes the
# function, attributing foreign ``@param`` notes to that function.
_FUNC = re.compile(
    r'/\*\*(?P<doc>(?:(?!\*/).)*)\*/\s*\n'
    r'(?:[A-Za-z_][\w\s\*]*?\n)?'           # optional return-type line
    r'(?P<name>[a-z][a-z0-9_]*)\s*\('
    r'(?P<params>[^;{]*?)\)\s*\{',
    re.S)
# One @param entry: capture the (possibly comma-separated) names + description.
# The description runs up to the next ``* @tag`` line, a ``*/`` or the end.
_PARAM = re.compile(
    r'@param\[[^\]]*\]\s+(?P<names>\w+(?:\s*,\s*\w+)*)\s+(?P<desc>.*?)'
    r'(?=\n\s*\*\s*@|\*/|\Z)', re.S)
# Phrases in a parameter description that mark the parameter as NULL-accepting.
_NULLISH = re.compile(r'may be\s+`?NULL`?|can be\s+`?NULL`?|`?NULL`?\s+is allowed'
                      r'|or\s+`?NULL`?', re.I)


def extract_nullable(meos_root: str | Path) -> dict[str, list[str]]:
    """Return ``{function: [nullable params]}`` from the MEOS C sources under
    ``meos_root`` (scans both ``src`` and ``include``).

    Every ``*.c`` file below ``<meos_root>/src`` and every ``*.h`` file below
    ``<meos_root>/include`` is searched for a Doxygen block directly followed
    by a function definition.  A parameter is reported when the description of
    its ``@param`` entry contains one of the NULL phrases of ``_NULLISH``.

    Functions without any such parameter do not appear in the result.  A
    function found in several files gets one merged, duplicate-free list in
    first-seen order.
    """
    root = Path(meos_root)
    # Sorted so the merged parameter order does not depend on the order in
    # which the filesystem happens to enumerate directories.
    files = sorted(glob.glob(str(root / "src/**/*.c"), recursive=True))
    files += sorted(glob.glob(str(root / "include/**/*.h"), recursive=True))

    out: dict[str, list[str]] = {}
    for path in files:
        # Explicit encoding: the locale default differs between platforms.
        # Undecodable bytes are dropped rather than aborting the whole scan.
        text = Path(path).read_text(encoding="utf-8", errors="ignore")
        for match in _FUNC.finditer(text):
            func_name = match.group("name")
            for entry in _PARAM.finditer(match.group("doc")):
                if not _NULLISH.search(entry.group("desc")):
                    continue
                known = out.setdefault(func_name, [])
                # One @param line may document several comma-separated names.
                for raw in entry.group("names").split(","):
                    param = raw.strip()
                    if param and param not in known:
                        known.append(param)
    return out


def merge_nullable(idl: dict, meos_root: str | Path) -> tuple[dict, int]:
    """Fold the extracted nullability into each function's ``shape.nullable``.

    For every entry of ``idl["functions"]`` the nullable parameter names
    extracted from ``meos_root`` are narrowed to those that also occur among
    the entry's own ``params``.  A non-empty result is stored as
    ``shape["nullable"]`` (creating ``shape`` when missing).

    Returns ``(idl, n)`` where ``idl`` is the same, mutated mapping and ``n``
    is the total number of parameter names recorded.
    """
    extracted = extract_nullable(meos_root)
    total = 0
    for entry in idl["functions"]:
        candidates = extracted.get(entry["name"])
        if candidates:
            declared = {param["name"] for param in entry.get("params", [])}
            # Only names present on the IDL signature are kept.
            matched = [cand for cand in candidates if cand in declared]
            if matched:
                entry.setdefault("shape", {})["nullable"] = matched
                total += len(matched)
    return idl, total
72 changes: 72 additions & 0 deletions parser/shapeinfer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""Infer per-function output-array *shape* from the C signatures.

MEOS array-returning functions follow one fixed convention, so the shape the
codegens need is fully derivable from the headers — no hand-maintained table:

TYPE *f(..., int *count) -> returns an array of ``count``
TYPE **f(..., TYPE **extra, int *count) -> primary array return PLUS one
or more parallel out-arrays

The output length is always passed *by pointer* (``int *count``); an *input*
array instead carries its length *by value* (``int count``). That pointer/value
distinction is exactly how a written-back out-array is told apart from a
read-only in-array — e.g. ``temporal_time_split(..., TimestampTz **time_bins,
int *count)`` (out) versus ``tsequence_make(const TInstant **instants, int
count, ...)`` (in).

This replaces the ``meta/meos-meta.json`` shape entries, which had drifted to a
3-function stub and silently mis-classified every out-array as an input
parameter, breaking the split / space-split / mvtgeom / normalize families in
every binding generated from the IDL.
"""
from __future__ import annotations


# Parameters that accept NULL by MEOS convention regardless of the function.
# ``srs`` is the optional spatial-reference string of every ``*_as_*json`` /
# text output function — passing NULL means "no CRS". Nullability is otherwise
# semantic (not signature-derivable), so this stays a narrow, named convention
# rather than a blanket rule; extend only when a binding's tests prove a param
# is passed None.
# NOTE(review): no function in this module reads this set — confirm whether
# another module imports it or whether it is a leftover that can be dropped.
_NULLABLE_BY_CONVENTION = {"srs"}


def _out_count_param(func: dict) -> str | None:
    """Name of the by-pointer output count parameter, or ``None``.

    The first parameter called ``count`` whose ``cType`` (ignoring surrounding
    whitespace) is exactly ``int *`` qualifies; its presence is what marks the
    function as returning array(s).
    """
    return next(
        (param["name"]
         for param in func.get("params", [])
         if param["name"] == "count"
         and param.get("cType", "").strip() == "int *"),
        None)


def _is_written_back_array(p: dict) -> bool:
    """A non-const double (or higher) pointer parameter the callee allocates
    and writes back, i.e. a parallel output array.

    The parameter qualifies when its ``cType`` contains ``**`` and does not
    start with the ``const`` qualifier.  ``const`` must be a whole token: a
    type whose name merely begins with those letters (``constraint_t **``) is
    not const-qualified and is still treated as written back.  A missing
    ``cType`` yields ``False``.
    """
    ct = p.get("cType", "")
    if "**" not in ct:
        return False
    head = ct.lstrip()
    if not head.startswith("const"):
        return True
    # The character right after "const" tells the qualifier apart from a
    # longer identifier; an empty slice (end of string) counts as qualifier.
    following = head[5:6]
    return following.isalnum() or following == "_"


def infer_shapes(idl: dict) -> tuple[dict, dict]:
    """Derive ``arrayReturn`` / ``outputArrays`` shape entries from signatures.

    Only functions for which ``_out_count_param`` finds an output count are
    touched; every other entry of ``idl["functions"]`` is left as it is.

    * A return type ending in ``*`` gets
      ``shape["arrayReturn"]["lengthFrom"]`` pointing at the count parameter.
    * Every other parameter accepted by ``_is_written_back_array`` is listed
      in ``shape["outputArrays"]``.

    Returns ``(idl, stats)`` where ``idl`` is the same, mutated mapping and
    ``stats`` holds the ``arrayReturn`` and ``outputArrays`` totals.
    """
    array_returns = 0
    output_arrays = 0
    for entry in idl["functions"]:
        count_name = _out_count_param(entry)
        if count_name:
            shape = entry.setdefault("shape", {})

            # Primary pointer return: its length is the output count.
            c_return = entry.get("returnType", {}).get("c", "")
            if c_return.rstrip().endswith("*"):
                length_ref = {"kind": "param", "name": count_name}
                shape.setdefault("arrayReturn", {})["lengthFrom"] = length_ref
                array_returns += 1

            # Parallel out-arrays written back next to the count parameter.
            parallel = []
            for param in entry["params"]:
                if param["name"] == count_name:
                    continue
                if _is_written_back_array(param):
                    parallel.append({"param": param["name"]})
            if parallel:
                shape["outputArrays"] = parallel
                output_arrays += len(parallel)
    return idl, {"arrayReturn": array_returns, "outputArrays": output_arrays}
12 changes: 12 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from parser.parser import parse_all_headers, merge_meta
from parser.portable import attach_portable_aliases
from parser.shapeinfer import infer_shapes
from parser.nullable import merge_nullable


HEADERS_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./meos/include")
Expand All @@ -19,6 +21,16 @@ def main():
print(f"[1/3] Parsing {HEADERS_DIR}...", file=sys.stderr)
idl = parse_all_headers(HEADERS_DIR)

# 1b. Generate the codegen `shape` from the signatures + Doxygen, replacing
# the hand-maintained meta stub. outputArrays/arrayReturn come from the
# parameter forms; nullable comes from the C `@param ... may be NULL` SoT.
idl, sh = infer_shapes(idl)
print(f" inferred shape: {sh['arrayReturn']} array returns, "
f"{sh['outputArrays']} output arrays", file=sys.stderr)
idl, nn = merge_nullable(idl, HEADERS_DIR.parent)
print(f" nullable params from Doxygen `may be NULL`: {nn}",
file=sys.stderr)

# 2. Merge with manual metadata
if META_PATH.exists():
print(f"[2/3] Merging with {META_PATH}...", file=sys.stderr)
Expand Down
75 changes: 75 additions & 0 deletions tests/test_nullable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Regression tests for parser/nullable.py.

Nullability is read from the C Doxygen `@param ... may be NULL` notes (the
source of truth) and folded into each function's ``shape.nullable`` for the
params that actually exist on the IDL function.

Plain unittest, no pytest dependency; writes a tiny synthetic source tree.
"""
import tempfile
import unittest
from pathlib import Path

from parser.nullable import extract_nullable, merge_nullable

# Synthetic C source written to ``src/sample.c`` by the test fixture.  It holds
# two Doxygen-documented function definitions; in each exactly one parameter
# (``srs`` and ``maxt`` respectively) carries a "may be `NULL`" note.
SAMPLE = '''
/**
* @ingroup meos_temporal_inout
* @brief Return the MF-JSON representation
* @param[in] temp Temporal value
* @param[in] srs Spatial reference system, may be `NULL`
*/
char *
temporal_as_mfjson(const Temporal *temp, char *srs)
{
return NULL;
}

/**
* @brief Append an instant
* @param[in] inst Instant
* @param[in] maxt Maximum time interval, may be `NULL`
* @param[in] interp Interpolation
*/
Temporal *
temporal_append_tinstant(const TInstant *inst, const Interval *maxt, int interp)
{
return NULL;
}
'''


class NullableTests(unittest.TestCase):
    """Checks extraction and merging against a throw-away source tree."""

    def setUp(self):
        # Layout: <tmp>/src/sample.c plus an empty <tmp>/include.
        self.tmp = tempfile.TemporaryDirectory()
        root = Path(self.tmp.name)
        source_dir = root / "src"
        source_dir.mkdir()
        (source_dir / "sample.c").write_text(SAMPLE)
        (root / "include").mkdir()

    def tearDown(self):
        self.tmp.cleanup()

    def test_extracts_only_may_be_null_params(self):
        found = extract_nullable(self.tmp.name)
        mfjson = found["temporal_as_mfjson"]
        self.assertEqual(mfjson, ["srs"])
        self.assertEqual(found["temporal_append_tinstant"], ["maxt"])
        # Parameters without a NULL note must not be reported.
        self.assertNotIn("temp", mfjson)

    def test_merge_only_existing_params(self):
        with_srs = {"name": "temporal_as_mfjson",
                    "params": [{"name": "temp"}, {"name": "srs"}]}
        # The documented nullable ``maxt`` is missing from this signature.
        without_maxt = {"name": "temporal_append_tinstant",
                        "params": [{"name": "inst"}]}
        idl, n = merge_nullable({"functions": [with_srs, without_maxt]},
                                self.tmp.name)
        first, second = idl["functions"]
        self.assertEqual(first["shape"]["nullable"], ["srs"])
        # Nothing to record, so no ``shape`` entry is created at all.
        self.assertNotIn("shape", second)
        self.assertEqual(n, 1)


if __name__ == "__main__":
unittest.main()
70 changes: 70 additions & 0 deletions tests/test_shapeinfer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Regression tests for parser/shapeinfer.py.

The inferer derives array-output shape from the C signatures, replacing the
hand-maintained meta stub. The discriminator is the *count* parameter's form:

* a written-back out-array pairs with a by-pointer ``int *count`` (the callee
fills the length) -> ``outputArrays`` + ``arrayReturn.lengthFrom``
* a read-only in-array pairs with a by-value ``int count`` -> left untouched

Plain unittest, no pytest dependency; fully synthetic IDL (no generated file).
"""
import unittest

from parser.shapeinfer import infer_shapes


def _fn(name, ret, params):
    """Build a minimal IDL function entry for the tests.

    ``params`` is an iterable of ``(name, c_type)`` pairs; the C type string
    is stored under both ``cType`` and ``canonical``, as is ``ret`` for the
    return type (``c`` / ``canonical``).
    """
    param_entries = []
    for param_name, c_type in params:
        param_entries.append(
            {"name": param_name, "cType": c_type, "canonical": c_type})
    return_type = {"c": ret, "canonical": ret}
    return {"name": name, "returnType": return_type, "params": param_entries}


class ShapeInferTests(unittest.TestCase):
    """Signature-driven shape inference on fully synthetic IDL entries."""

    def test_output_array_with_pointer_count(self):
        # temporal_time_split-style: non-const ** out-array + by-pointer count
        signature = [
            ("temp", "const Temporal *"),
            ("duration", "const Interval *"),
            ("torigin", "TimestampTz"),
            ("time_bins", "TimestampTz **"),
            ("count", "int *"),
        ]
        func = _fn("temporal_time_split", "Temporal **", signature)
        idl, stats = infer_shapes({"functions": [func]})
        shape = idl["functions"][0]["shape"]
        self.assertEqual(shape["outputArrays"], [{"param": "time_bins"}])
        self.assertEqual(shape["arrayReturn"]["lengthFrom"],
                         {"kind": "param", "name": "count"})
        self.assertEqual(stats["outputArrays"], 1)

    def test_two_parallel_output_arrays(self):
        signature = [
            ("temp", "const Temporal *"),
            ("vsize", "double"),
            ("value_bins", "double **"),
            ("time_bins", "TimestampTz **"),
            ("count", "int *"),
        ]
        func = _fn("tfloat_value_time_split", "Temporal **", signature)
        idl, _ = infer_shapes({"functions": [func]})
        expected = [{"param": "value_bins"}, {"param": "time_bins"}]
        self.assertEqual(idl["functions"][0]["shape"]["outputArrays"],
                         expected)

    def test_input_array_with_value_count_untouched(self):
        # tsequence_make-style: ** input array carries its length BY VALUE
        signature = [
            ("instants", "const TInstant **"),
            ("count", "int"),
            ("lower_inc", "bool"),
        ]
        func = _fn("tsequence_make", "TSequence *", signature)
        idl, stats = infer_shapes({"functions": [func]})
        self.assertNotIn("shape", idl["functions"][0])
        self.assertEqual(stats["outputArrays"], 0)

    def test_nonconst_input_array_with_value_count_untouched(self):
        # tsequenceset_make_gaps-style: non-const ** but BY-VALUE count => input
        signature = [
            ("instants", "TInstant **"),
            ("count", "int"),
            ("maxt", "const Interval *"),
        ]
        func = _fn("tsequenceset_make_gaps", "TSequenceSet *", signature)
        idl, stats = infer_shapes({"functions": [func]})
        self.assertEqual(stats["outputArrays"], 0)
        self.assertNotIn("shape", idl["functions"][0])


if __name__ == "__main__":
unittest.main()