diff --git a/contrib/btree_gist/btree_utils_var.c b/contrib/btree_gist/btree_utils_var.c index 0c0e952f736..63137c5c85c 100644 --- a/contrib/btree_gist/btree_utils_var.c +++ b/contrib/btree_gist/btree_utils_var.c @@ -116,36 +116,47 @@ gbt_var_leaf2node(GBT_VARKEY *leaf, const gbtree_vinfo *tinfo, FmgrInfo *flinfo) /* * returns the common prefix length of a node key + * + * If the underlying type is character data, the prefix length may point in + * the middle of a multibyte character. */ static int32 gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) { GBT_VARKEY_R r = gbt_var_key_readable(node); int32 i = 0; - int32 l = 0; + int32 l_left_to_match = 0; + int32 l_total = 0; int32 t1len = VARSIZE(r.lower) - VARHDRSZ; int32 t2len = VARSIZE(r.upper) - VARHDRSZ; int32 ml = Min(t1len, t2len); char *p1 = VARDATA(r.lower); char *p2 = VARDATA(r.upper); + const char *end1 = p1 + t1len; + const char *end2 = p2 + t2len; if (ml == 0) return 0; while (i < ml) { - if (tinfo->eml > 1 && l == 0) + if (tinfo->eml > 1 && l_left_to_match == 0) { - if ((l = pg_mblen(p1)) != pg_mblen(p2)) + l_total = pg_mblen_range(p1, end1); + if (l_total != pg_mblen_range(p2, end2)) { return i; } + l_left_to_match = l_total; } if (*p1 != *p2) { if (tinfo->eml > 1) { - return (i - l + 1); + int32 l_matched_subset = l_total - l_left_to_match; + + /* end common prefix at final byte of last matching char */ + return i - l_matched_subset; } else { @@ -155,7 +166,7 @@ gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) p1++; p2++; - l--; + l_left_to_match--; i++; } return ml; /* lower == upper */ diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c index e538928aba4..19db56f0352 100644 --- a/contrib/dict_xsyn/dict_xsyn.c +++ b/contrib/dict_xsyn/dict_xsyn.c @@ -48,15 +48,15 @@ find_word(char *in, char **end) char *start; *end = NULL; - while (*in && t_isspace(in)) - in += pg_mblen(in); + while (*in && t_isspace_cstr(in)) + in += pg_mblen_cstr(in); 
if (!*in || *in == '#') return NULL; start = in; - while (*in && !t_isspace(in)) - in += pg_mblen(in); + while (*in && !t_isspace_cstr(in)) + in += pg_mblen_cstr(in); *end = in; diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c index 999ddad76d9..08c1b216aa2 100644 --- a/contrib/hstore/hstore_io.c +++ b/contrib/hstore/hstore_io.c @@ -64,7 +64,7 @@ prssyntaxerror(HSParser *state) errsave(state->escontext, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error in hstore, near \"%.*s\" at position %d", - pg_mblen(state->ptr), state->ptr, + pg_mblen_cstr(state->ptr), state->ptr, (int) (state->ptr - state->begin)))); /* In soft error situation, return false as convenience for caller */ return false; diff --git a/contrib/intarray/_int_selfuncs.c b/contrib/intarray/_int_selfuncs.c index d4793b0b638..015649ab334 100644 --- a/contrib/intarray/_int_selfuncs.c +++ b/contrib/intarray/_int_selfuncs.c @@ -19,6 +19,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_statistic.h" #include "catalog/pg_type.h" +#include "commands/extension.h" #include "miscadmin.h" #include "utils/builtins.h" #include "utils/lsyscache.h" @@ -171,7 +172,18 @@ _int_matchsel(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(0.0); } - /* The caller made sure the const is a query, so get it now */ + /* + * Verify that the Const is a query_int, else return a default estimate. + * (This could only fail if someone attached this estimator to the wrong + * operator.) 
+ */ + if (((Const *) other)->consttype != + get_function_sibling_type(fcinfo->flinfo->fn_oid, "query_int")) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); + } + query = DatumGetQueryTypeP(((Const *) other)->constvalue); /* Empty query matches nothing */ diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c index a6466f575fd..bbc1f9f5c72 100644 --- a/contrib/ltree/lquery_op.c +++ b/contrib/ltree/lquery_op.c @@ -27,14 +27,14 @@ getlexeme(char *start, char *end, int *len) char *ptr; while (start < end && t_iseq(start, '_')) - start += pg_mblen(start); + start += pg_mblen_range(start, end); ptr = start; if (ptr >= end) return NULL; while (ptr < end && !t_iseq(ptr, '_')) - ptr += pg_mblen(ptr); + ptr += pg_mblen_range(ptr, end); *len = ptr - start; return start; diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h index 5e0761641d3..cf24add69f7 100644 --- a/contrib/ltree/ltree.h +++ b/contrib/ltree/ltree.h @@ -127,7 +127,7 @@ typedef struct #define LQUERY_HASNOT 0x01 /* valid label chars are alphanumerics, underscores and hyphens */ -#define ISLABEL(x) ( t_isalnum(x) || t_iseq(x, '_') || t_iseq(x, '-') ) +#define ISLABEL(x) ( t_isalnum_cstr(x) || t_iseq(x, '_') || t_iseq(x, '-') ) /* full text query */ diff --git a/contrib/ltree/ltree_io.c b/contrib/ltree/ltree_io.c index 3a0a4266870..24d2bf67def 100644 --- a/contrib/ltree/ltree_io.c +++ b/contrib/ltree/ltree_io.c @@ -56,7 +56,7 @@ parse_ltree(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); if (t_iseq(ptr, '.')) num++; ptr += charlen; @@ -71,7 +71,7 @@ parse_ltree(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); switch (state) { @@ -293,7 +293,7 @@ parse_lquery(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); if (t_iseq(ptr, '.')) num++; @@ -313,7 
+313,7 @@ parse_lquery(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); switch (state) { @@ -418,7 +418,7 @@ parse_lquery(const char *buf, struct Node *escontext) case LQPRS_WAITFNUM: if (t_iseq(ptr, ',')) state = LQPRS_WAITSNUM; - else if (t_isdigit(ptr)) + else if (t_isdigit_cstr(ptr)) { int low = atoi(ptr); @@ -436,7 +436,7 @@ parse_lquery(const char *buf, struct Node *escontext) UNCHAR; break; case LQPRS_WAITSNUM: - if (t_isdigit(ptr)) + if (t_isdigit_cstr(ptr)) { int high = atoi(ptr); @@ -467,7 +467,7 @@ parse_lquery(const char *buf, struct Node *escontext) case LQPRS_WAITCLOSE: if (t_iseq(ptr, '}')) state = LQPRS_WAITEND; - else if (!t_isdigit(ptr)) + else if (!t_isdigit_cstr(ptr)) UNCHAR; break; case LQPRS_WAITND: @@ -478,7 +478,7 @@ parse_lquery(const char *buf, struct Node *escontext) } else if (t_iseq(ptr, ',')) state = LQPRS_WAITSNUM; - else if (!t_isdigit(ptr)) + else if (!t_isdigit_cstr(ptr)) UNCHAR; break; case LQPRS_WAITEND: diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c index 2c27ebd180f..0f2954f31ba 100644 --- a/contrib/ltree/ltxtquery_io.c +++ b/contrib/ltree/ltxtquery_io.c @@ -64,7 +64,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint for (;;) { - charlen = pg_mblen(state->buf); + charlen = pg_mblen_cstr(state->buf); switch (state->state) { @@ -88,7 +88,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint *lenval = charlen; *flag = 0; } - else if (!t_isspace(state->buf)) + else if (!t_isspace_cstr(state->buf)) ereturn(state->escontext, ERR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("operand syntax error"))); diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index e8f43b8bdbf..803472e0928 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -101,7 +101,7 @@ text_to_bits(char *str, int len) ereport(ERROR, 
(errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid character \"%.*s\" in t_bits string", - pg_mblen(str + off), str + off))); + pg_mblen_cstr(str + off), str + off))); if (off % 8 == 7) bits[off / 8] = byte; diff --git a/contrib/pg_trgm/Makefile b/contrib/pg_trgm/Makefile index f8ecb34a2d2..faae60f8869 100644 --- a/contrib/pg_trgm/Makefile +++ b/contrib/pg_trgm/Makefile @@ -14,7 +14,7 @@ DATA = pg_trgm--1.5--1.6.sql pg_trgm--1.4--1.5.sql pg_trgm--1.3--1.4.sql \ pg_trgm--1.0--1.1.sql PGFILEDESC = "pg_trgm - trigram matching" -REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm +REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm REGRESS_OPTS += --init-file=$(top_srcdir)/src/test/regress/init_file ifdef USE_PGXS diff --git a/contrib/pg_trgm/data/trgm_utf8.data b/contrib/pg_trgm/data/trgm_utf8.data new file mode 100644 index 00000000000..713856e76a6 --- /dev/null +++ b/contrib/pg_trgm/data/trgm_utf8.data @@ -0,0 +1,50 @@ +Mathematics +数学 +गणित +Matemáticas +رياضيات +Mathématiques +গণিত +Matemática +Математика +ریاضی +Matematika +Mathematik +数学 +Mathematics +गणित +గణితం +Matematik +கணிதம் +數學 +Toán học +Matematika +数学 +수학 +ریاضی +Lissafi +Hisabati +Matematika +Matematica +ریاضی +ಗಣಿತ +ગણિત +คณิตศาสตร์ +ሂሳብ +गणित +ਗਣਿਤ +數學 +数学 +Iṣiro +數學 +သင်္ချာ +Herrega +رياضي +गणित +Математика +Matematyka +ഗണിതം +Matematika +رياضي +Matematika +Matematică diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm.out b/contrib/pg_trgm/expected/pg_utf8_trgm.out new file mode 100644 index 00000000000..0768e7d6a83 --- /dev/null +++ b/contrib/pg_trgm/expected/pg_utf8_trgm.out @@ -0,0 +1,8 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +-- Index 50 translations of the word "Mathematics" +CREATE TEMP TABLE mb (s text); +\copy mb from 'data/trgm_utf8.data' +CREATE INDEX ON mb USING gist(s gist_trgm_ops); diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm_1.out b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out new file mode 100644 index 
00000000000..8505c4fa552 --- /dev/null +++ b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out @@ -0,0 +1,3 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/contrib/pg_trgm/meson.build b/contrib/pg_trgm/meson.build index 093ac18400c..bd3a34f2557 100644 --- a/contrib/pg_trgm/meson.build +++ b/contrib/pg_trgm/meson.build @@ -39,6 +39,7 @@ tests += { 'regress': { 'sql': [ 'pg_trgm', + 'pg_utf8_trgm', 'pg_word_trgm', 'pg_strict_word_trgm', ], diff --git a/contrib/pg_trgm/sql/pg_utf8_trgm.sql b/contrib/pg_trgm/sql/pg_utf8_trgm.sql new file mode 100644 index 00000000000..0dd962ced83 --- /dev/null +++ b/contrib/pg_trgm/sql/pg_utf8_trgm.sql @@ -0,0 +1,9 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +-- Index 50 translations of the word "Mathematics" +CREATE TEMP TABLE mb (s text); +\copy mb from 'data/trgm_utf8.data' +CREATE INDEX ON mb USING gist(s gist_trgm_ops); diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index afb0adb222b..1bd41fa1901 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -52,10 +52,10 @@ typedef char trgm[3]; } while(0) #ifdef KEEPONLYALNUM -#define ISWORDCHR(c) (t_isalnum(c)) +#define ISWORDCHR(c, len) (t_isalnum_with_len(c, len)) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') ) #else -#define ISWORDCHR(c) (!t_isspace(c)) +#define ISWORDCHR(c, len) (!t_isspace_with_len(c, len)) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) ) #endif #define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) ) diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 49d4497b4f3..32c390257b3 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -173,18 +173,29 @@ static char * find_word(char *str, int lenstr, 
char **endword, int *charlen) { char *beginword = str; + const char *endstr = str + lenstr; - while (beginword - str < lenstr && !ISWORDCHR(beginword)) - beginword += pg_mblen(beginword); + while (beginword < endstr) + { + int clen = pg_mblen_range(beginword, endstr); - if (beginword - str >= lenstr) + if (ISWORDCHR(beginword, clen)) + break; + beginword += clen; + } + + if (beginword >= endstr) return NULL; *endword = beginword; *charlen = 0; - while (*endword - str < lenstr && ISWORDCHR(*endword)) + while (*endword < endstr) { - *endword += pg_mblen(*endword); + int clen = pg_mblen_range(*endword, endstr); + + if (!ISWORDCHR(*endword, clen)) + break; + *endword += clen; (*charlen)++; } @@ -232,9 +243,9 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) if (bytelen > charlen) { /* Find multibyte character boundaries and apply compact_trigram */ - int lenfirst = pg_mblen(str), - lenmiddle = pg_mblen(str + lenfirst), - lenlast = pg_mblen(str + lenfirst + lenmiddle); + int lenfirst = pg_mblen_unbounded(str), + lenmiddle = pg_mblen_unbounded(str + lenfirst), + lenlast = pg_mblen_unbounded(str + lenfirst + lenmiddle); while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen) { @@ -245,7 +256,7 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) lenfirst = lenmiddle; lenmiddle = lenlast; - lenlast = pg_mblen(ptr + lenfirst + lenmiddle); + lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle); } } else @@ -730,6 +741,7 @@ get_wildcard_part(const char *str, int lenstr, { const char *beginword = str; const char *endword; + const char *endstr = str + lenstr; char *s = buf; bool in_leading_wildcard_meta = false; bool in_trailing_wildcard_meta = false; @@ -742,11 +754,13 @@ get_wildcard_part(const char *str, int lenstr, * from this loop to the next one, since we may exit at a word character * that is in_escape. 
*/ - while (beginword - str < lenstr) + while (beginword < endstr) { + clen = pg_mblen_range(beginword, endstr); + if (in_escape) { - if (ISWORDCHR(beginword)) + if (ISWORDCHR(beginword, clen)) break; in_escape = false; in_leading_wildcard_meta = false; @@ -757,12 +771,12 @@ get_wildcard_part(const char *str, int lenstr, in_escape = true; else if (ISWILDCARDCHAR(beginword)) in_leading_wildcard_meta = true; - else if (ISWORDCHR(beginword)) + else if (ISWORDCHR(beginword, clen)) break; else in_leading_wildcard_meta = false; } - beginword += pg_mblen(beginword); + beginword += clen; } /* @@ -795,12 +809,12 @@ get_wildcard_part(const char *str, int lenstr, * string boundary. Strip escapes during copy. */ endword = beginword; - while (endword - str < lenstr) + while (endword < endstr) { - clen = pg_mblen(endword); + clen = pg_mblen_range(endword, endstr); if (in_escape) { - if (ISWORDCHR(endword)) + if (ISWORDCHR(endword, clen)) { memcpy(s, endword, clen); (*charlen)++; @@ -828,7 +842,7 @@ get_wildcard_part(const char *str, int lenstr, in_trailing_wildcard_meta = true; break; } - else if (ISWORDCHR(endword)) + else if (ISWORDCHR(endword, clen)) { memcpy(s, endword, clen); (*charlen)++; diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 1d369460671..6d797c72203 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -481,7 +481,7 @@ static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation); static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA); -static bool convertPgWchar(pg_wchar c, trgm_mb_char *result); +static int convertPgWchar(pg_wchar c, trgm_mb_char *result); static void transformGraph(TrgmNFA *trgmNFA); static void processState(TrgmNFA *trgmNFA, TrgmState *state); static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key); @@ -806,10 +806,11 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) 
for (j = 0; j < charsCount; j++) { trgm_mb_char c; + int clen = convertPgWchar(chars[j], &c); - if (!convertPgWchar(chars[j], &c)) + if (!clen) continue; /* ok to ignore it altogether */ - if (ISWORDCHR(c.bytes)) + if (ISWORDCHR(c.bytes, clen)) colorInfo->wordChars[colorInfo->wordCharsCount++] = c; else colorInfo->containsNonWord = true; @@ -821,13 +822,15 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) /* * Convert pg_wchar to multibyte format. - * Returns false if the character should be ignored completely. + * Returns 0 if the character should be ignored completely, else returns its + * byte length. */ -static bool +static int convertPgWchar(pg_wchar c, trgm_mb_char *result) { /* "s" has enough space for a multibyte character and a trailing NUL */ char s[MAX_MULTIBYTE_CHAR_LEN + 1]; + int clen; /* * We can ignore the NUL character, since it can never appear in a PG text @@ -835,11 +838,11 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) * reconstructing trigrams. */ if (c == 0) - return false; + return 0; /* Do the conversion, making sure the result is NUL-terminated */ memset(s, 0, sizeof(s)); - pg_wchar2mb_with_len(&c, s, 1); + clen = pg_wchar2mb_with_len(&c, s, 1); /* * In IGNORECASE mode, we can ignore uppercase characters. 
We assume that @@ -861,7 +864,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) if (strcmp(lowerCased, s) != 0) { pfree(lowerCased); - return false; + return 0; } pfree(lowerCased); } @@ -869,7 +872,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */ memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN); - return true; + return clen; } diff --git a/contrib/pgcrypto/Makefile b/contrib/pgcrypto/Makefile index 647952af597..8b6dabc9fe0 100644 --- a/contrib/pgcrypto/Makefile +++ b/contrib/pgcrypto/Makefile @@ -45,7 +45,8 @@ REGRESS = init md5 sha1 hmac-md5 hmac-sha1 blowfish rijndael sm4\ sha2 des 3des cast5 \ crypt-des crypt-md5 crypt-blowfish crypt-xdes \ pgp-armor pgp-decrypt pgp-encrypt $(CF_PGP_TESTS) \ - pgp-pubkey-decrypt pgp-pubkey-encrypt pgp-info \ + pgp-pubkey-decrypt pgp-pubkey-encrypt pgp-pubkey-session \ + pgp-info \ setup_fips EXTRA_CLEAN = gen-rtab diff --git a/contrib/pgcrypto/expected/pgp-decrypt.out b/contrib/pgcrypto/expected/pgp-decrypt.out index eb049ba9d44..1db89e8c00a 100644 --- a/contrib/pgcrypto/expected/pgp-decrypt.out +++ b/contrib/pgcrypto/expected/pgp-decrypt.out @@ -315,7 +315,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== \xda39a3ee5e6b4b0d3255bfef95601890afd80709 (1 row) -select digest(pgp_sym_decrypt(dearmor(' +select digest(pgp_sym_decrypt_bytea(dearmor(' -----BEGIN PGP MESSAGE----- Comment: dat3.aes.sha1.mdc.s2k3.z0 @@ -387,6 +387,27 @@ ERROR: Wrong key or corrupt data select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); NOTICE: dbg: parse_literal_data: data type=b ERROR: Not text data +-- NUL byte in text decrypt. 
Ciphertext source: +-- printf 'a\x00\xc' | gpg --homedir /nonexistent --textmode \ +-- --personal-cipher-preferences aes --no-emit-version --batch \ +-- --symmetric --passphrase key --armor +do $$ +begin + perform pgp_sym_decrypt(dearmor(' +-----BEGIN PGP MESSAGE----- + +jA0EBwMCLd9OvySmZNZg0jgBe7vGTmnje5HGXI+zsIQ99WPZu4Zs/P6pQcZ+HZ4n +SZQHOfE8tagjB6Rqow82QpSBiOfWn4qjhQ== +=c2cz +-----END PGP MESSAGE----- +'), 'key', 'debug=1'); +exception when others then + raise '%', + regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); +end +$$; +ERROR: invalid byte sequence for encoding [REDACTED]: 0x00 +CONTEXT: PL/pgSQL function inline_code_block line 12 at RAISE -- Decryption with a certain incorrect key yields an apparent BZip2-compressed -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') -- until the random prefix gave rise to that property. diff --git a/contrib/pgcrypto/expected/pgp-decrypt_1.out b/contrib/pgcrypto/expected/pgp-decrypt_1.out index 80a4c48613d..d214e0bc0e0 100644 --- a/contrib/pgcrypto/expected/pgp-decrypt_1.out +++ b/contrib/pgcrypto/expected/pgp-decrypt_1.out @@ -311,7 +311,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== \xda39a3ee5e6b4b0d3255bfef95601890afd80709 (1 row) -select digest(pgp_sym_decrypt(dearmor(' +select digest(pgp_sym_decrypt_bytea(dearmor(' -----BEGIN PGP MESSAGE----- Comment: dat3.aes.sha1.mdc.s2k3.z0 @@ -383,6 +383,27 @@ ERROR: Wrong key or corrupt data select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); NOTICE: dbg: parse_literal_data: data type=b ERROR: Not text data +-- NUL byte in text decrypt. 
Ciphertext source: +-- printf 'a\x00\xc' | gpg --homedir /nonexistent --textmode \ +-- --personal-cipher-preferences aes --no-emit-version --batch \ +-- --symmetric --passphrase key --armor +do $$ +begin + perform pgp_sym_decrypt(dearmor(' +-----BEGIN PGP MESSAGE----- + +jA0EBwMCLd9OvySmZNZg0jgBe7vGTmnje5HGXI+zsIQ99WPZu4Zs/P6pQcZ+HZ4n +SZQHOfE8tagjB6Rqow82QpSBiOfWn4qjhQ== +=c2cz +-----END PGP MESSAGE----- +'), 'key', 'debug=1'); +exception when others then + raise '%', + regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); +end +$$; +ERROR: invalid byte sequence for encoding [REDACTED]: 0x00 +CONTEXT: PL/pgSQL function inline_code_block line 12 at RAISE -- Decryption with a certain incorrect key yields an apparent BZip2-compressed -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') -- until the random prefix gave rise to that property. diff --git a/contrib/pgcrypto/expected/pgp-pubkey-session.out b/contrib/pgcrypto/expected/pgp-pubkey-session.out new file mode 100644 index 00000000000..f724d98eb24 --- /dev/null +++ b/contrib/pgcrypto/expected/pgp-pubkey-session.out @@ -0,0 +1,47 @@ +-- Test for overflow with session key at decrypt. +-- Data automatically generated by scripts/pgp_session_data.py. +-- See this file for details explaining how this data is generated. 
+SELECT pgp_pub_decrypt_bytea( +'\xc1c04c030000000000000000020800a46f5b9b1905b49457a6485474f71ed9b46c2527e1 +da08e1f7871e12c3d38828f2076b984a595bf60f616599ca5729d547de06a258bfbbcd30 +94a321e4668cd43010f0ca8ecf931e5d39bda1152c50c367b11c723f270729245d3ebdbd +0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5060af7603cfd9ed186ebadd616 +3b50ae42bea5f6d14dda24e6d4687b434c175084515d562e896742b0ba9a1c87d5642e10 +a5550379c71cc490a052ada483b5d96526c0a600fc51755052aa77fdf72f7b4989b920e7 +b90f4b30787a46482670d5caecc7a515a926055ad5509d135702ce51a0e4c1033f2d939d +8f0075ec3428e17310da37d3d2d7ad1ce99adcc91cd446c366c402ae1ee38250343a7fcc +0f8bc28020e603d7a4795ef0dcc1c04c030000000000000000020800a46f5b9b1905b494 +57a6485474f71ed9b46c2527e1da08e1f7871e12c3d38828f2076b984a595bf60f616599 +ca5729d547de06a258bfbbcd3094a321e4668cd43010f0ca8ecf931e5d39bda1152c50c3 +67b11c723f270729245d3ebdbd0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5 +060af7603cfd9ed186ebadd6163b50ae42bea5f6d14dda24e6d4687b434c175084515d56 +2e896742b0ba9a1c87d5642e10a5550379c71cc490a052ada483b5d96526c0a600fc5175 +5052aa77fdf72f7b4989b920e7b90f4b30787a46482670d5caecc7a515a926055ad5509d +135702ce51a0e4c1033f2d939d8f0075ec3428e17310da37d3d2d7ad1ce99adc'::bytea, +'\xc7c2d8046965d657020800eef8bf1515adb1a3ee7825f75c668ea8dd3e3f9d13e958f6ad +9c55adc0c931a4bb00abe1d52cf7bb0c95d537949d277a5292ede375c6b2a67a3bf7d19f +f975bb7e7be35c2d8300dacba360a0163567372f7dc24000cc7cb6170bedc8f3b1f98c12 +07a6cb4de870a4bc61319b139dcc0e20c368fd68f8fd346d2c0b69c5aed560504e2ec6f1 +23086fe3c5540dc4dd155c0c67257c4ada862f90fe172ace344089da8135e92aca5c2709 +f1c1bc521798bb8c0365841496e709bd184132d387e0c9d5f26dc00fd06c3a76ef66a75c +138285038684707a847b7bd33cfbefbf1d336be954a8048946af97a66352adef8e8b5ae4 +c4748c6f2510265b7a8267bc370dbb00110100010007ff7e72d4f95d2d39901ac12ca5c5 +18e767e719e72340c3fab51c8c5ab1c40f31db8eaffe43533fa61e2dbca2c3f4396c0847 +e5434756acbb1f68128f4136bb135710c89137d74538908dac77967de9e821c559700dd9 
+de5a2727eec1f5d12d5d74869dd1de45ed369d94a8814d23861dd163f8c27744b26b98f0 +239c2e6dd1e3493b8cc976fdc8f9a5e250f715aa4c3d7d5f237f8ee15d242e8fa941d1a0 +ed9550ab632d992a97518d142802cb0a97b251319bf5742db8d9d8cbaa06cdfba2d75bc9 +9d77a51ff20bd5ba7f15d7af6e85b904de2855d19af08d45f39deb85403033c69c767a8e +74a343b1d6c8911d34ea441ac3850e57808ed3d885835cbe6c79d10400ef16256f3d5c4c +3341516a2d2aa888df81b603f48a27f3666b40f992a857c1d11ff639cd764a9b42d5a1f8 +58b4aeee36b85508bb5e8b91ef88a7737770b330224479d9b44eae8c631bc43628b69549 +507c0a1af0be0dd7696015abea722b571eb35eefc4ab95595378ec12814727443f625fcd +183bb9b3bccf53b54dd0e5e7a50400ffe08537b2d4e6074e4a1727b658cfccdec8962302 +25e300c05690de45f7065c3d40d86f544a64d51a3e94424f9851a16d1322ebdb41fa8a45 +3131f3e2dc94e858e6396722643df382680f815e53bcdcde5da622f50530a83b217f1103 +cdd6e5e9babe1e415bbff28d44bd18c95f43bbd04afeb2a2a99af38a571c7540de21df03 +ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8 +7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8 +487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75 +9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea); +ERROR: Public key too big diff --git a/contrib/pgcrypto/meson.build b/contrib/pgcrypto/meson.build index df7dd50dbc3..57ebfd7ae6d 100644 --- a/contrib/pgcrypto/meson.build +++ b/contrib/pgcrypto/meson.build @@ -50,6 +50,7 @@ pgcrypto_regress = [ 'pgp-encrypt', 'pgp-pubkey-decrypt', 'pgp-pubkey-encrypt', + 'pgp-pubkey-session', 'pgp-info', ] diff --git a/contrib/pgcrypto/pgp-pgsql.c b/contrib/pgcrypto/pgp-pgsql.c index d9b15b07b0f..838a7c381fc 100644 --- a/contrib/pgcrypto/pgp-pgsql.c +++ b/contrib/pgcrypto/pgp-pgsql.c @@ -631,6 +631,7 @@ pgp_sym_decrypt_text(PG_FUNCTION_ARGS) arg = PG_GETARG_BYTEA_PP(2); res = decrypt_internal(0, 1, data, key, NULL, arg); + pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false); PG_FREE_IF_COPY(data, 0); PG_FREE_IF_COPY(key, 1); @@ -732,6 +733,7 @@ 
pgp_pub_decrypt_text(PG_FUNCTION_ARGS) arg = PG_GETARG_BYTEA_PP(3); res = decrypt_internal(1, 1, data, key, psw, arg); + pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false); PG_FREE_IF_COPY(data, 0); PG_FREE_IF_COPY(key, 1); diff --git a/contrib/pgcrypto/pgp-pubdec.c b/contrib/pgcrypto/pgp-pubdec.c index a0a5738a40e..2a13aa3e6ad 100644 --- a/contrib/pgcrypto/pgp-pubdec.c +++ b/contrib/pgcrypto/pgp-pubdec.c @@ -157,6 +157,7 @@ pgp_parse_pubenc_sesskey(PGP_Context *ctx, PullFilter *pkt) uint8 *msg; int msglen; PGP_MPI *m; + unsigned sess_key_len; pk = ctx->pub_key; if (pk == NULL) @@ -220,11 +221,19 @@ pgp_parse_pubenc_sesskey(PGP_Context *ctx, PullFilter *pkt) if (res < 0) goto out; + sess_key_len = msglen - 3; + if (sess_key_len > PGP_MAX_KEY) + { + px_debug("incorrect session key length=%u", sess_key_len); + res = PXE_PGP_KEY_TOO_BIG; + goto out; + } + /* * got sesskey */ ctx->cipher_algo = *msg; - ctx->sess_key_len = msglen - 3; + ctx->sess_key_len = sess_key_len; memcpy(ctx->sess_key, msg + 1, ctx->sess_key_len); out: diff --git a/contrib/pgcrypto/px.c b/contrib/pgcrypto/px.c index d35ccca7774..a7cb248f6b7 100644 --- a/contrib/pgcrypto/px.c +++ b/contrib/pgcrypto/px.c @@ -65,6 +65,7 @@ static const struct error_desc px_err_list[] = { {PXE_PGP_UNEXPECTED_PKT, "Unexpected packet in key data"}, {PXE_PGP_MATH_FAILED, "Math operation failed"}, {PXE_PGP_SHORT_ELGAMAL_KEY, "Elgamal keys must be at least 1024 bits long"}, + {PXE_PGP_KEY_TOO_BIG, "Public key too big"}, {PXE_PGP_UNKNOWN_PUBALGO, "Unknown public-key encryption algorithm"}, {PXE_PGP_WRONG_KEY, "Wrong key"}, {PXE_PGP_MULTIPLE_KEYS, diff --git a/contrib/pgcrypto/px.h b/contrib/pgcrypto/px.h index 222d8b1ad2f..bda0524a04a 100644 --- a/contrib/pgcrypto/px.h +++ b/contrib/pgcrypto/px.h @@ -75,7 +75,7 @@ /* -108 is unused */ #define PXE_PGP_MATH_FAILED -109 #define PXE_PGP_SHORT_ELGAMAL_KEY -110 -/* -111 is unused */ +#define PXE_PGP_KEY_TOO_BIG -111 #define PXE_PGP_UNKNOWN_PUBALGO -112 #define 
PXE_PGP_WRONG_KEY -113 #define PXE_PGP_MULTIPLE_KEYS -114 diff --git a/contrib/pgcrypto/scripts/pgp_session_data.py b/contrib/pgcrypto/scripts/pgp_session_data.py new file mode 100644 index 00000000000..999350bb2bc --- /dev/null +++ b/contrib/pgcrypto/scripts/pgp_session_data.py @@ -0,0 +1,491 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# Generate PGP data to check the session key length of the input data provided +# to pgp_pub_decrypt_bytea(). +# +# First, the crafted data is generated from valid RSA data, freshly generated +# by this script each time it is run, see generate_rsa_keypair(). +# Second, the crafted PGP data is built, see build_message_data() and +# build_key_data(). Finally, the resulting SQL script is generated. +# +# This script generates in stdout the SQL file that is used in the regression +# tests of pgcrypto. The following command can be used to regenerate the file +# which should never be manually manipulated: +# python3 scripts/pgp_session_data.py > sql/pgp-pubkey-session.sql + +import os +import re +import struct +import secrets +import sys +import time + +# pwn for binary manipulation (p32, p64) +from pwn import * + +# Cryptographic libraries, to craft the PGP data. +from Crypto.Cipher import AES +from Crypto.PublicKey import RSA +from Crypto.Util.number import inverse + +# AES key used for session key encryption (16 bytes for AES-128) +AES_KEY = b'\x01' * 16 + +def generate_rsa_keypair(key_size: int = 2048) -> dict: + """ + Generate a fresh RSA key pair. + + The generated key includes all components needed for PGP operations: + - n: public modulus (p * q) + - e: public exponent (typically 65537) + - d: private exponent (e^-1 mod phi(n)) + - p, q: prime factors of n + - u: coefficient (p^-1 mod q) for CRT optimization + + The caller can pass the wanted key size in input, for a default of 2048 + bits. This function returns the RSA key components, after performing + some validation on them. 
+ """ + + start_time = time.time() + + # Generate RSA key + key = RSA.generate(key_size) + + # Extract all key components + rsa_components = { + 'n': key.n, # Public modulus (p * q) + 'e': key.e, # Public exponent (typically 65537) + 'd': key.d, # Private exponent (e^-1 mod phi(n)) + 'p': key.p, # First prime factor + 'q': key.q, # Second prime factor + 'u': inverse(key.p, key.q) # Coefficient for CRT: p^-1 mod q + } + + # Validate key components for correctness + validate_rsa_key(rsa_components) + + return rsa_components + +def validate_rsa_key(rsa: dict) -> None: + """ + Validate a generated RSA key. + + This function performs basic validation to ensure the RSA key is properly + constructed and all components are consistent, at least mathematically. + + Validations performed: + 1. n = p * q (modulus is product of primes) + 2. gcd(e, phi(n)) = 1 (public exponent is coprime to phi(n)) + 3. (d * e) mod(phi(n)) = 1 (private exponent is multiplicative inverse) + 4. (u * p) (mod q) = 1 (coefficient is correct for CRT) + """ + + n, e, d, p, q, u = rsa['n'], rsa['e'], rsa['d'], rsa['p'], rsa['q'], rsa['u'] + + # Check that n = p * q + if n != p * q: + raise ValueError("RSA validation failed: n <> p * q") + + # Check that p and q are different + if p == q: + raise ValueError("RSA validation failed: p = q (not allowed)") + + # Calculate phi(n) = (p-1)(q-1) + phi_n = (p - 1) * (q - 1) + + # Check that gcd(e, phi(n)) = 1 + def gcd(a, b): + while b: + a, b = b, a % b + return a + + if gcd(e, phi_n) != 1: + raise ValueError("RSA validation failed: gcd(e, phi(n)) <> 1") + + # Check that (d * e) mod(phi(n)) = 1 + if (d * e) % phi_n != 1: + raise ValueError("RSA validation failed: d * e <> 1 (mod phi(n))") + + # Check that (u * p) (mod q) = 1 + if (u * p) % q != 1: + raise ValueError("RSA validation failed: u * p <> 1 (mod q)") + +def mpi_encode(x: int) -> bytes: + """ + Encode an integer as an OpenPGP Multi-Precision Integer (MPI). 
+ + Format (RFC 4880, Section 3.2): + - 2 bytes: bit length of the integer (big-endian) + - N bytes: the integer in big-endian format + + This is used to encode RSA key components (n, e, d, p, q, u) in PGP + packets. + + The integer to encode is given in input, returning an MPI-encoded + integer. + + For example: + mpi_encode(65537) -> b'\x00\x11\x01\x00\x01' + (17 bits, value 0x010001) + """ + if x < 0: + raise ValueError("MPI cannot encode negative integers") + + if x == 0: + # Special case: zero has 0 bits and empty magnitude + bits = 0 + mag = b"" + else: + # Calculate bit length and convert to bytes + bits = x.bit_length() + mag = x.to_bytes((bits + 7) // 8, 'big') + + # Pack: 2-byte bit length + magnitude bytes + return struct.pack('>H', bits) + mag + +def new_packet(tag: int, payload: bytes) -> bytes: + """ + Create a new OpenPGP packet with a proper header. + + OpenPGP packet format (RFC 4880, Section 4.2): + - New packet format: 0xC0 | tag + - Length encoding depends on payload size: + * 0-191: single byte + * 192-8383: two bytes (192 + ((length - 192) >> 8), (length - 192) & 0xFF) + * 8384+: five bytes (0xFF + 4-byte big-endian length) + + The packet is built from a "tag" (1-63) and some "payload" data. The + result generated is a complete OpenPGP packet. 
+ + For example: + new_packet(1, b'data') -> b'\xC1\x04data' + (Tag 1, length 4, payload 'data') + """ + # New packet format: bits 7 and 6 set, packet tag in bits 0-5 + first = 0xC0 | (tag & 0x3F) + ln = len(payload) + + # Encode length according to OpenPGP specification + if ln <= 191: + # Single byte length for small packets + llen = bytes([ln]) + elif ln <= 8383: + # Two-byte length for medium packets + ln2 = ln - 192 + llen = bytes([192 + (ln2 >> 8), ln2 & 0xFF]) + else: + # Five-byte length for large packets + llen = bytes([255]) + struct.pack('>I', ln) + + return bytes([first]) + llen + payload + +def build_key_data(rsa: dict) -> bytes: + """ + Build the key data, containing an RSA private key. + + The RSA contents should have been generated previously. + + Format (see RFC 4880, Section 5.5.3): + - 1 byte: version (4) + - 4 bytes: creation time (current Unix timestamp) + - 1 byte: public key algorithm (2 = RSA encrypt) + - MPI: RSA public modulus n + - MPI: RSA public exponent e + - 1 byte: string-to-key usage (0 = no encryption) + - MPI: RSA private exponent d + - MPI: RSA prime p + - MPI: RSA prime q + - MPI: RSA coefficient u = p^-1 mod q + - 2 bytes: checksum of private key material + + This function takes a set of RSA key components in input (n, e, d, p, q, u) + and returns a secret key packet.
+ """ + + # Public key portion + ver = bytes([4]) # Version 4 key + ctime = struct.pack('>I', int(time.time())) # Current Unix timestamp + algo = bytes([2]) # RSA encrypt algorithm + n_mpi = mpi_encode(rsa['n']) # Public modulus + e_mpi = mpi_encode(rsa['e']) # Public exponent + pub = ver + ctime + algo + n_mpi + e_mpi + + # Private key portion + hide_type = bytes([0]) # No string-to-key encryption + d_mpi = mpi_encode(rsa['d']) # Private exponent + p_mpi = mpi_encode(rsa['p']) # Prime p + q_mpi = mpi_encode(rsa['q']) # Prime q + u_mpi = mpi_encode(rsa['u']) # Coefficient u = p^-1 mod q + + # Calculate checksum of private key material (simple sum mod 65536) + private_data = d_mpi + p_mpi + q_mpi + u_mpi + cksum = sum(private_data) & 0xFFFF + + secret = hide_type + private_data + struct.pack('>H', cksum) + payload = pub + secret + + return new_packet(7, payload) + +def pgp_cfb_encrypt_resync(key, plaintext): + """ + Implement OpenPGP CFB mode with resync. + + OpenPGP CFB mode is a variant of standard CFB with a resync operation + after the first two blocks. + + Algorithm (RFC 4880, Section 13.9): + 1. Block 1: FR=zeros, encrypt full block_size bytes + 2. Block 2: FR=block1, encrypt only 2 bytes + 3. Resync: FR = block1[2:] + block2 + 4. 
Remaining blocks: standard CFB mode + + This function uses the following arguments: + - key: AES encryption key (16 bytes for AES-128) + - plaintext: Data to encrypt + """ + block_size = 16 # AES block size + cipher = AES.new(key[:16], AES.MODE_ECB) # Use ECB for manual CFB + ciphertext = b'' + + # Block 1: FR=zeros, encrypt full 16 bytes + FR = b'\x00' * block_size + FRE = cipher.encrypt(FR) # Encrypt the feedback register + block1 = bytes(a ^ b for a, b in zip(FRE, plaintext[0:16])) + ciphertext += block1 + + # Block 2: FR=block1, encrypt only 2 bytes + FR = block1 + FRE = cipher.encrypt(FR) + block2 = bytes(a ^ b for a, b in zip(FRE[0:2], plaintext[16:18])) + ciphertext += block2 + + # Resync: FR = block1[2:16] + block2[0:2] + # This is the key difference from standard CFB mode + FR = block1[2:] + block2 + + # Block 3+: Continue with standard CFB mode + pos = 18 + while pos < len(plaintext): + FRE = cipher.encrypt(FR) + chunk_len = min(block_size, len(plaintext) - pos) + chunk = plaintext[pos:pos+chunk_len] + enc_chunk = bytes(a ^ b for a, b in zip(FRE[:chunk_len], chunk)) + ciphertext += enc_chunk + + # Update feedback register for next iteration + if chunk_len == block_size: + FR = enc_chunk + else: + # Partial block: pad with old FR bytes + FR = enc_chunk + FR[chunk_len:] + pos += chunk_len + + return ciphertext + +def build_literal_data_packet(data: bytes) -> bytes: + """ + Build a literal data packet containing a message. + + Format (RFC 4880, Section 5.9): + - 1 byte: data format ('b' = binary, 't' = text, 'u' = UTF-8 text) + - 1 byte: filename length (0 = no filename) + - N bytes: filename (empty in this case) + - 4 bytes: date (current Unix timestamp) + - M bytes: literal data + + The data used to build the packet is given in input, with the generated + result returned. 
+ """ + body = bytes([ + ord('b'), # Binary data format + 0, # Filename length (0 = no filename) + ]) + struct.pack('>I', int(time.time())) + data # Current timestamp + data + + return new_packet(11, body) + +def build_symenc_data_packet(sess_key: bytes, cipher_algo: int, payload: bytes) -> bytes: + """ + Build a symmetrically-encrypted data packet using AES-128-CFB. + + This packet contains encrypted data using the session key. The format + includes a random prefix, for security (see RFC 4880, Section 5.7). + + Packet structure: + - Random prefix (block_size bytes) + - Prefix repeat (last 2 bytes of prefix repeated) + - Encrypted literal data packet + + This function uses the following set of arguments: + - sess_key: Session key for encryption + - cipher_algo: Cipher algorithm identifier (7 = AES-128) + - payload: Data to encrypt (wrapped in literal data packet) + """ + block_size = 16 # AES-128 block size + key = sess_key[:16] # Use first 16 bytes for AES-128 + + # Create random prefix + repeat last 2 bytes (total 18 bytes) + # This is required by OpenPGP for integrity checking + prefix_random = secrets.token_bytes(block_size) + prefix = prefix_random + prefix_random[-2:] # 18 bytes total + + # Wrap payload in literal data packet + literal_pkt = build_literal_data_packet(payload) + + # Plaintext = prefix + literal data packet + plaintext = prefix + literal_pkt + + # Encrypt using OpenPGP CFB mode with resync + ciphertext = pgp_cfb_encrypt_resync(key, plaintext) + + return new_packet(9, ciphertext) + +def build_tag1_packet(rsa: dict, sess_key: bytes) -> bytes: + """ + Build a public-key encrypted key. + + This is a very important function, as it is able to create the packet + triggering the overflow check. This function can also be used to create + "legit" packet data. 
+ + Format (RFC 4880, Section 5.1): + - 1 byte: version (3) + - 8 bytes: key ID (0 = any key accepted) + - 1 byte: public key algorithm (2 = RSA encrypt) + - MPI: RSA-encrypted session key + + This takes as arguments the generated RSA key pair, and the session key + to encrypt. The latter is manipulated to trigger the overflow. + + This function returns a complete packet holding the encrypted session key. + """ + + # Calculate RSA modulus size in bytes + n_bytes = (rsa['n'].bit_length() + 7) // 8 + + # Session key message format: + # - 1 byte: symmetric cipher algorithm (7 = AES-128) + # - N bytes: session key + # - 2 bytes: checksum (simple sum of session key bytes) + algo_byte = bytes([7]) # AES-128 algorithm identifier + cksum = sum(sess_key) & 0xFFFF # 16-bit checksum + M = algo_byte + sess_key + struct.pack('>H', cksum) + + # PKCS#1 v1.5 padding construction + # Format: 0x02 || PS || 0x00 || M + # Total padded message must be exactly n_bytes long. + total_len = n_bytes # Total length must equal modulus size in bytes + ps_len = total_len - len(M) - 2 # Subtract 2 for 0x02 and 0x00 bytes + + if ps_len < 8: + raise ValueError(f"Padding string too short ({ps_len} bytes); need at least 8 bytes. " + f"Message length: {len(M)}, Modulus size: {n_bytes} bytes") + + # Create padding string with *ALL* bytes being 0xFF (no zero byte inside PS)
+ PS = bytes([0xFF]) * ps_len + + # Construct the complete padded message + # Normal PKCS#1 v1.5 padding: 0x02 || PS || 0x00 || M + padded = bytes([0x02]) + PS + bytes([0x00]) + M + + # Verify padding construction + if len(padded) != n_bytes: + raise ValueError(f"Padded message length ({len(padded)}) doesn't match RSA modulus size ({n_bytes})") + + # Convert padded message to integer and encrypt with RSA + m_int = int.from_bytes(padded, 'big') + + # Ensure message is smaller than modulus (required for RSA) + if m_int >= rsa['n']: + raise ValueError("Padded message is larger than RSA modulus") + + # RSA encryption: c = m^e mod n + c_int = pow(m_int, rsa['e'], rsa['n']) + + # Encode encrypted result as MPI + c_mpi = mpi_encode(c_int) + + # Build complete packet + ver = bytes([3]) # Version 3 packet + key_id = b"\x00" * 8 # Key ID (0 = any key accepted) + algo = bytes([2]) # RSA encrypt algorithm + payload = ver + key_id + algo + c_mpi + + return new_packet(1, payload) + +def build_message_data(rsa: dict) -> bytes: + """ + This function creates a crafted message, with a long session key + length. + + This takes in input the RSA key components generated previously, + returning a concatenated set of PGP packets crafted for the purpose + of this test. + """ + + # Base prefix for session key (AES key + padding + size). + # Note that the crafted size is the important part for this test. + prefix = AES_KEY + b"\x00" * 16 + p32(0x10) + + # Build encrypted data packet, legit. + sedata = build_symenc_data_packet(AES_KEY, cipher_algo=7, payload=b"\x0a\x00") + + # Build multiple packets + packets = [ + # First packet, legit. + build_tag1_packet(rsa, prefix), + + # Encrypted data packet, legit. + sedata, + + # Second packet: information payload. + # + # This packet contains a longer-crafted session key, able to trigger + # the overflow check in pgcrypto. This is the critical part, and + # you are right to pay a lot of attention here if you are + # reading this code.
+ build_tag1_packet(rsa, prefix) + ] + + return b"".join(packets) + +def main(): + # Default key size. + # This number can be set to a higher number if wanted, like 4096. We + # just do not need to do that here. + key_size = 2048 + + # Generate fresh RSA key pair + rsa = generate_rsa_keypair(key_size) + + # Generate the message data. + print("### Building message data", file=sys.stderr) + message_data = build_message_data(rsa) + + # Build the key containing the RSA private key + print("### Building key data", file=sys.stderr) + key_data = build_key_data(rsa) + + # Convert to hexadecimal, for the bytea used in the SQL file. + message_data = message_data.hex() + key_data = key_data.hex() + + # Split each value into lines of 72 characters, for readability. + message_data = re.sub("(.{72})", "\\1\n", message_data, 0, re.DOTALL) + key_data = re.sub("(.{72})", "\\1\n", key_data, 0, re.DOTALL) + + # Get the script filename for documentation + file_basename = os.path.basename(__file__) + + # Output the SQL test case + print(f'''-- Test for overflow with session key at decrypt. +-- Data automatically generated by scripts/{file_basename}. +-- See this file for details explaining how this data is generated. +SELECT pgp_pub_decrypt_bytea( +'\\x{message_data}'::bytea, +'\\x{key_data}'::bytea);''', + file=sys.stdout) + +if __name__ == "__main__": + main() diff --git a/contrib/pgcrypto/sql/pgp-decrypt.sql b/contrib/pgcrypto/sql/pgp-decrypt.sql index 49a0267bbcb..2fe498f2f02 100644 --- a/contrib/pgcrypto/sql/pgp-decrypt.sql +++ b/contrib/pgcrypto/sql/pgp-decrypt.sql @@ -228,7 +228,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== -----END PGP MESSAGE----- '), '0123456789abcdefghij'), 'sha1'); -select digest(pgp_sym_decrypt(dearmor(' +select digest(pgp_sym_decrypt_bytea(dearmor(' -----BEGIN PGP MESSAGE----- Comment: dat3.aes.sha1.mdc.s2k3.z0 @@ -282,6 +282,26 @@ VsxxqLSPzNLAeIspJk5G -- Routine text/binary mismatch. 
select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); +-- NUL byte in text decrypt. Ciphertext source: +-- printf 'a\x00\xc' | gpg --homedir /nonexistent --textmode \ +-- --personal-cipher-preferences aes --no-emit-version --batch \ +-- --symmetric --passphrase key --armor +do $$ +begin + perform pgp_sym_decrypt(dearmor(' +-----BEGIN PGP MESSAGE----- + +jA0EBwMCLd9OvySmZNZg0jgBe7vGTmnje5HGXI+zsIQ99WPZu4Zs/P6pQcZ+HZ4n +SZQHOfE8tagjB6Rqow82QpSBiOfWn4qjhQ== +=c2cz +-----END PGP MESSAGE----- +'), 'key', 'debug=1'); +exception when others then + raise '%', + regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); +end +$$; + -- Decryption with a certain incorrect key yields an apparent BZip2-compressed -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') -- until the random prefix gave rise to that property. diff --git a/contrib/pgcrypto/sql/pgp-pubkey-session.sql b/contrib/pgcrypto/sql/pgp-pubkey-session.sql new file mode 100644 index 00000000000..51792f1f4d8 --- /dev/null +++ b/contrib/pgcrypto/sql/pgp-pubkey-session.sql @@ -0,0 +1,46 @@ +-- Test for overflow with session key at decrypt. +-- Data automatically generated by scripts/pgp_session_data.py. +-- See this file for details explaining how this data is generated. 
+SELECT pgp_pub_decrypt_bytea( +'\xc1c04c030000000000000000020800a46f5b9b1905b49457a6485474f71ed9b46c2527e1 +da08e1f7871e12c3d38828f2076b984a595bf60f616599ca5729d547de06a258bfbbcd30 +94a321e4668cd43010f0ca8ecf931e5d39bda1152c50c367b11c723f270729245d3ebdbd +0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5060af7603cfd9ed186ebadd616 +3b50ae42bea5f6d14dda24e6d4687b434c175084515d562e896742b0ba9a1c87d5642e10 +a5550379c71cc490a052ada483b5d96526c0a600fc51755052aa77fdf72f7b4989b920e7 +b90f4b30787a46482670d5caecc7a515a926055ad5509d135702ce51a0e4c1033f2d939d +8f0075ec3428e17310da37d3d2d7ad1ce99adcc91cd446c366c402ae1ee38250343a7fcc +0f8bc28020e603d7a4795ef0dcc1c04c030000000000000000020800a46f5b9b1905b494 +57a6485474f71ed9b46c2527e1da08e1f7871e12c3d38828f2076b984a595bf60f616599 +ca5729d547de06a258bfbbcd3094a321e4668cd43010f0ca8ecf931e5d39bda1152c50c3 +67b11c723f270729245d3ebdbd0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5 +060af7603cfd9ed186ebadd6163b50ae42bea5f6d14dda24e6d4687b434c175084515d56 +2e896742b0ba9a1c87d5642e10a5550379c71cc490a052ada483b5d96526c0a600fc5175 +5052aa77fdf72f7b4989b920e7b90f4b30787a46482670d5caecc7a515a926055ad5509d +135702ce51a0e4c1033f2d939d8f0075ec3428e17310da37d3d2d7ad1ce99adc'::bytea, +'\xc7c2d8046965d657020800eef8bf1515adb1a3ee7825f75c668ea8dd3e3f9d13e958f6ad +9c55adc0c931a4bb00abe1d52cf7bb0c95d537949d277a5292ede375c6b2a67a3bf7d19f +f975bb7e7be35c2d8300dacba360a0163567372f7dc24000cc7cb6170bedc8f3b1f98c12 +07a6cb4de870a4bc61319b139dcc0e20c368fd68f8fd346d2c0b69c5aed560504e2ec6f1 +23086fe3c5540dc4dd155c0c67257c4ada862f90fe172ace344089da8135e92aca5c2709 +f1c1bc521798bb8c0365841496e709bd184132d387e0c9d5f26dc00fd06c3a76ef66a75c +138285038684707a847b7bd33cfbefbf1d336be954a8048946af97a66352adef8e8b5ae4 +c4748c6f2510265b7a8267bc370dbb00110100010007ff7e72d4f95d2d39901ac12ca5c5 +18e767e719e72340c3fab51c8c5ab1c40f31db8eaffe43533fa61e2dbca2c3f4396c0847 +e5434756acbb1f68128f4136bb135710c89137d74538908dac77967de9e821c559700dd9 
+de5a2727eec1f5d12d5d74869dd1de45ed369d94a8814d23861dd163f8c27744b26b98f0 +239c2e6dd1e3493b8cc976fdc8f9a5e250f715aa4c3d7d5f237f8ee15d242e8fa941d1a0 +ed9550ab632d992a97518d142802cb0a97b251319bf5742db8d9d8cbaa06cdfba2d75bc9 +9d77a51ff20bd5ba7f15d7af6e85b904de2855d19af08d45f39deb85403033c69c767a8e +74a343b1d6c8911d34ea441ac3850e57808ed3d885835cbe6c79d10400ef16256f3d5c4c +3341516a2d2aa888df81b603f48a27f3666b40f992a857c1d11ff639cd764a9b42d5a1f8 +58b4aeee36b85508bb5e8b91ef88a7737770b330224479d9b44eae8c631bc43628b69549 +507c0a1af0be0dd7696015abea722b571eb35eefc4ab95595378ec12814727443f625fcd +183bb9b3bccf53b54dd0e5e7a50400ffe08537b2d4e6074e4a1727b658cfccdec8962302 +25e300c05690de45f7065c3d40d86f544a64d51a3e94424f9851a16d1322ebdb41fa8a45 +3131f3e2dc94e858e6396722643df382680f815e53bcdcde5da622f50530a83b217f1103 +cdd6e5e9babe1e415bbff28d44bd18c95f43bbd04afeb2a2a99af38a571c7540de21df03 +ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8 +7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8 +487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75 +9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea); diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c index 64c879e5470..554843b3548 100644 --- a/contrib/unaccent/unaccent.c +++ b/contrib/unaccent/unaccent.c @@ -149,9 +149,9 @@ initTrie(const char *filename) state = 0; for (ptr = line; *ptr; ptr += ptrlen) { - ptrlen = pg_mblen(ptr); + ptrlen = pg_mblen_cstr(ptr); /* ignore whitespace, but end src or trg */ - if (t_isspace(ptr)) + if (t_isspace_cstr(ptr)) { if (state == 1) state = 2; @@ -315,6 +315,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) char *srcchar = (char *) PG_GETARG_POINTER(1); int32 len = PG_GETARG_INT32(2); char *srcstart = srcchar; + const char *srcend = srcstart + len; TSLexeme *res; StringInfoData buf; @@ -342,7 +343,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) } else { - matchlen = pg_mblen(srcchar); + matchlen = pg_mblen_range(srcchar, srcend); 
if (buf.data != NULL) appendBinaryStringInfo(&buf, srcchar, matchlen); } diff --git a/pom.xml b/pom.xml index 43c583614a6..0b461661554 100644 --- a/pom.xml +++ b/pom.xml @@ -1802,6 +1802,7 @@ code or new licensing patterns. contrib/btree_gist/btree_bool.c contrib/basic_archive/basic_archive.conf contrib/pg_freespacemap/pg_freespacemap.conf + contrib/pgcrypto/scripts/pgp_session_data.py contrib/pg_walinspect/walinspect.conf contrib/pgrowlocks/specs/pgrowlocks.spec contrib/tcn/specs/tcn.spec diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index ac21884162f..933aa42ffc9 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -235,6 +235,7 @@ hashoidvector(PG_FUNCTION_ARGS) { oidvector *key = (oidvector *) PG_GETARG_POINTER(0); + check_valid_oidvector(key); return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid)); } @@ -243,6 +244,7 @@ hashoidvectorextended(PG_FUNCTION_ARGS) { oidvector *key = (oidvector *) PG_GETARG_POINTER(0); + check_valid_oidvector(key); return hash_any_extended((unsigned char *) key->values, key->dim1 * sizeof(Oid), PG_GETARG_INT64(1)); diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index 976a2cc6447..720733b75d2 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -299,6 +299,9 @@ btoidvectorcmp(PG_FUNCTION_ARGS) oidvector *b = (oidvector *) PG_GETARG_POINTER(1); int i; + check_valid_oidvector(a); + check_valid_oidvector(b); + /* We arbitrarily choose to sort first by vector length */ if (a->dim1 != b->dim1) PG_RETURN_INT32(a->dim1 - b->dim1); diff --git a/src/backend/catalog/pg_depend.c b/src/backend/catalog/pg_depend.c index 02e0ce71a07..b3d1c2fba99 100644 --- a/src/backend/catalog/pg_depend.c +++ b/src/backend/catalog/pg_depend.c @@ -23,11 +23,13 @@ #include "catalog/pg_constraint.h" #include "catalog/pg_depend.h" #include "catalog/pg_extension.h" +#include 
"catalog/pg_type.h" #include "commands/extension.h" #include "miscadmin.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/syscache.h" static bool isObjectPinned(const ObjectAddress *object); @@ -812,6 +814,77 @@ getAutoExtensionsOfObject(Oid classId, Oid objectId) return result; } +/* + * Look up a type belonging to an extension. + * + * Returns the type's OID, or InvalidOid if not found. + * + * Notice that the type is specified by name only, without a schema. + * That's because this will typically be used by relocatable extensions + * which can't make a-priori assumptions about which schema their objects + * are in. As long as the extension only defines one type of this name, + * the answer is unique anyway. + * + * We might later add the ability to look up functions, operators, etc. + */ +Oid +getExtensionType(Oid extensionOid, const char *typname) +{ + Oid result = InvalidOid; + Relation depRel; + ScanKeyData key[3]; + SysScanDesc scan; + HeapTuple tup; + + depRel = table_open(DependRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_depend_refclassid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(ExtensionRelationId)); + ScanKeyInit(&key[1], + Anum_pg_depend_refobjid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(extensionOid)); + ScanKeyInit(&key[2], + Anum_pg_depend_refobjsubid, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(0)); + + scan = systable_beginscan(depRel, DependReferenceIndexId, true, + NULL, 3, key); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_depend depform = (Form_pg_depend) GETSTRUCT(tup); + + if (depform->classid == TypeRelationId && + depform->deptype == DEPENDENCY_EXTENSION) + { + Oid typoid = depform->objid; + HeapTuple typtup; + + typtup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typoid)); + if (!HeapTupleIsValid(typtup)) + continue; /* should we throw an error? 
*/ + if (strcmp(NameStr(((Form_pg_type) GETSTRUCT(typtup))->typname), + typname) == 0) + { + result = typoid; + ReleaseSysCache(typtup); + break; /* no need to keep searching */ + } + ReleaseSysCache(typtup); + } + } + + systable_endscan(scan); + + table_close(depRel, AccessShareLock); + + return result; +} + /* * Detect whether a sequence is marked as "owned" by a column * diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c index 6285ce06f66..f46e854dd81 100644 --- a/src/backend/catalog/pg_proc.c +++ b/src/backend/catalog/pg_proc.c @@ -1227,7 +1227,7 @@ match_prosrc_to_literal(const char *prosrc, const char *literal, if (cursorpos > 0) newcp++; } - chlen = pg_mblen(prosrc); + chlen = pg_mblen_cstr(prosrc); if (strncmp(prosrc, literal, chlen) != 0) goto fail; prosrc += chlen; diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index 41e06471f4e..8ec1ffdf209 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -46,6 +46,7 @@ #include "catalog/pg_depend.h" #include "catalog/pg_extension.h" #include "catalog/pg_namespace.h" +#include "catalog/pg_proc.h" #include "catalog/pg_type.h" #include "cdb/cdbgang.h" #include "commands/alter.h" @@ -63,10 +64,12 @@ #include "utils/builtins.h" #include "utils/conffiles.h" #include "utils/fmgroids.h" +#include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" #include "utils/snapmgr.h" +#include "utils/syscache.h" #include "utils/varlena.h" #include "catalog/oid_dispatch.h" @@ -114,7 +117,26 @@ typedef struct ExtensionVersionInfo struct ExtensionVersionInfo *previous; /* current best predecessor */ } ExtensionVersionInfo; +/* + * Cache structure for get_function_sibling_type (and maybe later, + * allied lookup functions). 
+ */ +typedef struct ExtensionSiblingCache +{ + struct ExtensionSiblingCache *next; /* list link */ + /* lookup key: requesting function's OID and type name */ + Oid reqfuncoid; + const char *typname; + bool valid; /* is entry currently valid? */ + uint32 exthash; /* cache hash of owning extension's OID */ + Oid typeoid; /* OID associated with typname */ +} ExtensionSiblingCache; + +/* Head of linked list of ExtensionSiblingCache structs */ +static ExtensionSiblingCache *ext_sibling_list = NULL; + /* Local functions */ +static void ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue); static List *find_update_path(List *evi_list, ExtensionVersionInfo *evi_start, ExtensionVersionInfo *evi_target, @@ -264,6 +286,114 @@ get_extension_schema(Oid ext_oid) return result; } +/* + * get_function_sibling_type - find a type belonging to same extension as func + * + * Returns the type's OID, or InvalidOid if not found. + * + * This is useful in extensions, which won't have fixed object OIDs. + * We work from the calling function's own OID, which it can get from its + * FunctionCallInfo parameter, and look up the owning extension and thence + * a type belonging to the same extension. + * + * Notice that the type is specified by name only, without a schema. + * That's because this will typically be used by relocatable extensions + * which can't make a-priori assumptions about which schema their objects + * are in. As long as the extension only defines one type of this name, + * the answer is unique anyway. + * + * We might later add the ability to look up functions, operators, etc. + * + * This code is simply a frontend for some pg_depend lookups. Those lookups + * are fairly expensive, so we provide a simple cache facility. We assume + * that the passed typname is actually a C constant, or at least permanently + * allocated, so that we need not copy that string. 
+ */ +Oid +get_function_sibling_type(Oid funcoid, const char *typname) +{ + ExtensionSiblingCache *cache_entry; + Oid extoid; + Oid typeoid; + + /* + * See if we have the answer cached. Someday there may be enough callers + * to justify a hash table, but for now, a simple linked list is fine. + */ + for (cache_entry = ext_sibling_list; cache_entry != NULL; + cache_entry = cache_entry->next) + { + if (funcoid == cache_entry->reqfuncoid && + strcmp(typname, cache_entry->typname) == 0) + break; + } + if (cache_entry && cache_entry->valid) + return cache_entry->typeoid; + + /* + * Nope, so do the expensive lookups. We do not expect failures, so we do + * not cache negative results. + */ + extoid = getExtensionOfObject(ProcedureRelationId, funcoid); + if (!OidIsValid(extoid)) + return InvalidOid; + typeoid = getExtensionType(extoid, typname); + if (!OidIsValid(typeoid)) + return InvalidOid; + + /* + * Build, or revalidate, cache entry. + */ + if (cache_entry == NULL) + { + /* Register invalidation hook if this is first entry */ + if (ext_sibling_list == NULL) + CacheRegisterSyscacheCallback(EXTENSIONOID, + ext_sibling_callback, + (Datum) 0); + + /* Momentarily zero the space to ensure valid flag is false */ + cache_entry = (ExtensionSiblingCache *) + MemoryContextAllocZero(CacheMemoryContext, + sizeof(ExtensionSiblingCache)); + cache_entry->next = ext_sibling_list; + ext_sibling_list = cache_entry; + } + + cache_entry->reqfuncoid = funcoid; + cache_entry->typname = typname; + cache_entry->exthash = GetSysCacheHashValue1(EXTENSIONOID, + ObjectIdGetDatum(extoid)); + cache_entry->typeoid = typeoid; + /* Mark it valid only once it's fully populated */ + cache_entry->valid = true; + + return typeoid; +} + +/* + * ext_sibling_callback + * Syscache inval callback function for EXTENSIONOID cache + * + * It seems sufficient to invalidate ExtensionSiblingCache entries when + * the owning extension's pg_extension entry is modified or deleted. 
+ * Neither a requesting function's OID, nor the OID of the object it's + * looking for, could change without an extension update or drop/recreate. + */ +static void +ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue) +{ + ExtensionSiblingCache *cache_entry; + + for (cache_entry = ext_sibling_list; cache_entry != NULL; + cache_entry = cache_entry->next) + { + if (hashvalue == 0 || + cache_entry->exthash == hashvalue) + cache_entry->valid = false; + } +} + /* * Utility functions to check validity of extension and version names */ diff --git a/src/backend/commands/operatorcmds.c b/src/backend/commands/operatorcmds.c index 51530eb2f56..47e4a02edb9 100644 --- a/src/backend/commands/operatorcmds.c +++ b/src/backend/commands/operatorcmds.c @@ -309,7 +309,6 @@ ValidateRestrictionEstimator(List *restrictionName) { Oid typeId[4]; Oid restrictionOid; - AclResult aclresult; typeId[0] = INTERNALOID; /* PlannerInfo */ typeId[1] = OIDOID; /* operator OID */ @@ -325,11 +324,33 @@ ValidateRestrictionEstimator(List *restrictionName) errmsg("restriction estimator function %s must return type %s", NameListToString(restrictionName), "float8"))); - /* Require EXECUTE rights for the estimator */ - aclresult = object_aclcheck(ProcedureRelationId, restrictionOid, GetUserId(), ACL_EXECUTE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, OBJECT_FUNCTION, - NameListToString(restrictionName)); + /* + * If the estimator is not a built-in function, require superuser + * privilege to install it. This protects against using something that is + * not a restriction estimator or has hard-wired assumptions about what + * data types it is working with. (Built-in estimators are required to + * defend themselves adequately against unexpected data type choices, but + * it seems impractical to expect that of extensions' estimators.) + * + * If it is built-in, only require EXECUTE rights. 
+ */ + if (restrictionOid >= FirstGenbkiObjectId) + { + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to specify a non-built-in restriction estimator function"))); + } + else + { + AclResult aclresult; + + aclresult = object_aclcheck(ProcedureRelationId, restrictionOid, + GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + NameListToString(restrictionName)); + } return restrictionOid; } @@ -345,7 +366,6 @@ ValidateJoinEstimator(List *joinName) Oid typeId[5]; Oid joinOid; Oid joinOid2; - AclResult aclresult; typeId[0] = INTERNALOID; /* PlannerInfo */ typeId[1] = OIDOID; /* operator OID */ @@ -383,11 +403,24 @@ ValidateJoinEstimator(List *joinName) errmsg("join estimator function %s must return type %s", NameListToString(joinName), "float8"))); - /* Require EXECUTE rights for the estimator */ - aclresult = object_aclcheck(ProcedureRelationId, joinOid, GetUserId(), ACL_EXECUTE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, OBJECT_FUNCTION, - NameListToString(joinName)); + /* privilege checks are the same as in ValidateRestrictionEstimator */ + if (joinOid >= FirstGenbkiObjectId) + { + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to specify a non-built-in join estimator function"))); + } + else + { + AclResult aclresult; + + aclresult = object_aclcheck(ProcedureRelationId, joinOid, + GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + NameListToString(joinName)); + } return joinOid; } diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index c7cf7c04b60..fe7ca30ce97 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -47,8 +47,8 @@ findwrd(char *in, char **end, uint16 *flags) char *lastchar; /* Skip leading spaces */ - while (*in && t_isspace(in)) - in += 
pg_mblen(in); + while (*in && t_isspace_cstr(in)) + in += pg_mblen_cstr(in); /* Return NULL on empty lines */ if (*in == '\0') @@ -60,10 +60,10 @@ findwrd(char *in, char **end, uint16 *flags) lastchar = start = in; /* Find end of word */ - while (*in && !t_isspace(in)) + while (*in && !t_isspace_cstr(in)) { lastchar = in; - in += pg_mblen(in); + in += pg_mblen_cstr(in); } if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags) diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c index 80402e99375..dcee060ba19 100644 --- a/src/backend/tsearch/dict_thesaurus.c +++ b/src/backend/tsearch/dict_thesaurus.c @@ -190,8 +190,8 @@ thesaurusRead(const char *filename, DictThesaurus *d) ptr = line; /* is it a comment? */ - while (*ptr && t_isspace(ptr)) - ptr += pg_mblen(ptr); + while (*ptr && t_isspace_cstr(ptr)) + ptr += pg_mblen_cstr(ptr); if (t_iseq(ptr, '#') || *ptr == '\0' || t_iseq(ptr, '\n') || t_iseq(ptr, '\r')) @@ -212,7 +212,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) errmsg("unexpected delimiter"))); state = TR_WAITSUBS; } - else if (!t_isspace(ptr)) + else if (!t_isspace_cstr(ptr)) { beginwrd = ptr; state = TR_INLEX; @@ -225,7 +225,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); state = TR_WAITSUBS; } - else if (t_isspace(ptr)) + else if (t_isspace_cstr(ptr)) { newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); state = TR_WAITLEX; @@ -237,15 +237,15 @@ thesaurusRead(const char *filename, DictThesaurus *d) { useasis = true; state = TR_INSUBS; - beginwrd = ptr + pg_mblen(ptr); + beginwrd = ptr + pg_mblen_cstr(ptr); } else if (t_iseq(ptr, '\\')) { useasis = false; state = TR_INSUBS; - beginwrd = ptr + pg_mblen(ptr); + beginwrd = ptr + pg_mblen_cstr(ptr); } - else if (!t_isspace(ptr)) + else if (!t_isspace_cstr(ptr)) { useasis = false; beginwrd = ptr; @@ -254,7 +254,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) } else if (state == TR_INSUBS) 
{ - if (t_isspace(ptr)) + if (t_isspace_cstr(ptr)) { if (ptr == beginwrd) ereport(ERROR, @@ -267,7 +267,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) else elog(ERROR, "unrecognized thesaurus state: %d", state); - ptr += pg_mblen(ptr); + ptr += pg_mblen_cstr(ptr); } if (state == TR_INSUBS) diff --git a/src/backend/tsearch/regis.c b/src/backend/tsearch/regis.c index 0c74c6d0c1c..ee5bc378350 100644 --- a/src/backend/tsearch/regis.c +++ b/src/backend/tsearch/regis.c @@ -37,7 +37,7 @@ RS_isRegis(const char *str) { if (state == RS_IN_WAIT) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) /* okay */ ; else if (t_iseq(c, '[')) state = RS_IN_ONEOF; @@ -48,14 +48,14 @@ RS_isRegis(const char *str) { if (t_iseq(c, '^')) state = RS_IN_NONEOF; - else if (t_isalpha(c)) + else if (t_isalpha_cstr(c)) state = RS_IN_ONEOF_IN; else return false; } else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) /* okay */ ; else if (t_iseq(c, ']')) state = RS_IN_WAIT; @@ -64,7 +64,7 @@ RS_isRegis(const char *str) } else elog(ERROR, "internal error in RS_isRegis: state %d", state); - c += pg_mblen(c); + c += pg_mblen_cstr(c); } return (state == RS_IN_WAIT); @@ -96,15 +96,14 @@ RS_compile(Regis *r, bool issuffix, const char *str) { if (state == RS_IN_WAIT) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) { if (ptr) ptr = newRegisNode(ptr, len); else ptr = r->node = newRegisNode(NULL, len); - COPYCHAR(ptr->data, c); ptr->type = RSF_ONEOF; - ptr->len = pg_mblen(c); + ptr->len = ts_copychar_cstr(ptr->data, c); } else if (t_iseq(c, '[')) { @@ -125,10 +124,9 @@ RS_compile(Regis *r, bool issuffix, const char *str) ptr->type = RSF_NONEOF; state = RS_IN_NONEOF; } - else if (t_isalpha(c)) + else if (t_isalpha_cstr(c)) { - COPYCHAR(ptr->data, c); - ptr->len = pg_mblen(c); + ptr->len = ts_copychar_cstr(ptr->data, c); state = RS_IN_ONEOF_IN; } else /* shouldn't get here */ @@ -136,11 +134,8 @@ RS_compile(Regis *r, bool issuffix, const char *str) } 
else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) { - if (t_isalpha(c)) - { - COPYCHAR(ptr->data + ptr->len, c); - ptr->len += pg_mblen(c); - } + if (t_isalpha_cstr(c)) + ptr->len += ts_copychar_cstr(ptr->data + ptr->len, c); else if (t_iseq(c, ']')) state = RS_IN_WAIT; else /* shouldn't get here */ @@ -148,7 +143,7 @@ RS_compile(Regis *r, bool issuffix, const char *str) } else elog(ERROR, "internal error in RS_compile: state %d", state); - c += pg_mblen(c); + c += pg_mblen_cstr(c); } if (state != RS_IN_WAIT) /* shouldn't get here */ @@ -187,10 +182,10 @@ mb_strchr(char *str, char *c) char *ptr = str; bool res = false; - clen = pg_mblen(c); + clen = pg_mblen_cstr(c); while (*ptr && !res) { - plen = pg_mblen(ptr); + plen = pg_mblen_cstr(ptr); if (plen == clen) { i = plen; @@ -219,7 +214,7 @@ RS_execute(Regis *r, char *str) while (*c) { len++; - c += pg_mblen(c); + c += pg_mblen_cstr(c); } if (len < r->nchar) @@ -230,7 +225,7 @@ RS_execute(Regis *r, char *str) { len -= r->nchar; while (len-- > 0) - c += pg_mblen(c); + c += pg_mblen_cstr(c); } @@ -250,7 +245,7 @@ RS_execute(Regis *r, char *str) elog(ERROR, "unrecognized regis node type: %d", ptr->type); } ptr = ptr->next; - c += pg_mblen(c); + c += pg_mblen_cstr(c); } return true; diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c index 8d1d81501f2..8f4fdf2782c 100644 --- a/src/backend/tsearch/spell.c +++ b/src/backend/tsearch/spell.c @@ -232,7 +232,7 @@ findchar(char *str, int c) { if (t_iseq(str, c)) return str; - str += pg_mblen(str); + str += pg_mblen_cstr(str); } return NULL; @@ -245,7 +245,7 @@ findchar2(char *str, int c1, int c2) { if (t_iseq(str, c1) || t_iseq(str, c2)) return str; - str += pg_mblen(str); + str += pg_mblen_cstr(str); } return NULL; @@ -352,6 +352,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) char *next, *sbuf = *sflagset; int maxstep; + int clen; bool stop = false; bool met_comma = false; @@ -363,11 +364,11 @@ 
getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) { case FM_LONG: case FM_CHAR: - COPYCHAR(sflag, *sflagset); - sflag += pg_mblen(*sflagset); + clen = ts_copychar_cstr(sflag, *sflagset); + sflag += clen; /* Go to start of the next flag */ - *sflagset += pg_mblen(*sflagset); + *sflagset += clen; /* Check if we get all characters of flag */ maxstep--; @@ -391,7 +392,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) *sflagset = next; while (**sflagset) { - if (t_isdigit(*sflagset)) + if (t_isdigit_cstr(*sflagset)) { if (!met_comma) ereport(ERROR, @@ -409,7 +410,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) *sflagset))); met_comma = true; } - else if (!t_isspace(*sflagset)) + else if (!t_isspace_cstr(*sflagset)) { ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), @@ -417,7 +418,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) *sflagset))); } - *sflagset += pg_mblen(*sflagset); + *sflagset += pg_mblen_cstr(*sflagset); } stop = true; break; @@ -543,7 +544,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename) while (*s) { /* we allow only single encoded flags for faster works */ - if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s)) + if (pg_mblen_cstr(s) == 1 && t_isprint_unbounded(s) && !t_isspace_unbounded(s)) s++; else { @@ -559,12 +560,12 @@ NIImportDictionary(IspellDict *Conf, const char *filename) s = line; while (*s) { - if (t_isspace(s)) + if (t_isspace_cstr(s)) { *s = '\0'; break; } - s += pg_mblen(s); + s += pg_mblen_cstr(s); } pstr = lowerstr_ctx(Conf, line); @@ -796,17 +797,17 @@ get_nextfield(char **str, char *next) while (**str) { + int clen = pg_mblen_cstr(*str); + if (state == PAE_WAIT_MASK) { if (t_iseq(*str, '#')) return false; - else if (!t_isspace(*str)) + else if (!t_isspace_cstr(*str)) { - int clen = pg_mblen(*str); - if (clen < avail) { - COPYCHAR(next, *str); + ts_copychar_with_len(next, *str, clen); next += clen; avail -= clen; } @@ 
-815,24 +816,22 @@ get_nextfield(char **str, char *next) } else /* state == PAE_INMASK */ { - if (t_isspace(*str)) + if (t_isspace_cstr(*str)) { *next = '\0'; return true; } else { - int clen = pg_mblen(*str); - if (clen < avail) { - COPYCHAR(next, *str); + ts_copychar_with_len(next, *str, clen); next += clen; avail -= clen; } } } - *str += pg_mblen(*str); + *str += clen; } *next = '\0'; @@ -922,14 +921,15 @@ parse_affentry(char *str, char *mask, char *find, char *repl) while (*str) { + int clen = pg_mblen_cstr(str); + if (state == PAE_WAIT_MASK) { if (t_iseq(str, '#')) return false; - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) { - COPYCHAR(pmask, str); - pmask += pg_mblen(str); + pmask += ts_copychar_with_len(pmask, str, clen); state = PAE_INMASK; } } @@ -940,10 +940,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl) *pmask = '\0'; state = PAE_WAIT_FIND; } - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) { - COPYCHAR(pmask, str); - pmask += pg_mblen(str); + pmask += ts_copychar_with_len(pmask, str, clen); } } else if (state == PAE_WAIT_FIND) @@ -952,13 +951,12 @@ parse_affentry(char *str, char *mask, char *find, char *repl) { state = PAE_INFIND; } - else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ ) + else if (t_isalpha_cstr(str) || t_iseq(str, '\'') /* english 's */ ) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); state = PAE_INREPL; } - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error"))); @@ -970,12 +968,11 @@ parse_affentry(char *str, char *mask, char *find, char *repl) *pfind = '\0'; state = PAE_WAIT_REPL; } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(pfind, str); - pfind += pg_mblen(str); + pfind += ts_copychar_with_len(pfind, str, clen); } - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) ereport(ERROR, 
(errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error"))); @@ -986,13 +983,12 @@ parse_affentry(char *str, char *mask, char *find, char *repl) { break; /* void repl */ } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); state = PAE_INREPL; } - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error"))); @@ -1004,12 +1000,11 @@ parse_affentry(char *str, char *mask, char *find, char *repl) *prepl = '\0'; break; } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); } - else if (!t_isspace(str)) + else if (!t_isspace_cstr(str)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error"))); @@ -1017,7 +1012,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl) else elog(ERROR, "unrecognized state in parse_affentry: %d", state); - str += pg_mblen(str); + str += clen; } *pmask = *pfind = *prepl = '\0'; @@ -1070,10 +1065,9 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) CompoundAffixFlag *newValue; char sbuf[BUFSIZ]; char *sflag; - int clen; - while (*s && t_isspace(s)) - s += pg_mblen(s); + while (*s && t_isspace_cstr(s)) + s += pg_mblen_cstr(s); if (!*s) ereport(ERROR, @@ -1082,10 +1076,10 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) /* Get flag without \n */ sflag = sbuf; - while (*s && !t_isspace(s) && *s != '\n') + while (*s && !t_isspace_cstr(s) && *s != '\n') { - clen = pg_mblen(s); - COPYCHAR(sflag, s); + int clen = ts_copychar_cstr(sflag, s); + sflag += clen; s += clen; } @@ -1228,7 +1222,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) while ((recoded = tsearch_readline(&trst)) != NULL) { - if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) + if (*recoded == '\0' || 
t_isspace_cstr(recoded) || t_iseq(recoded, '#')) { pfree(recoded); continue; @@ -1265,8 +1259,8 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) { char *s = recoded + strlen("FLAG"); - while (*s && t_isspace(s)) - s += pg_mblen(s); + while (*s && t_isspace_cstr(s)) + s += pg_mblen_cstr(s); if (*s) { @@ -1301,7 +1295,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) { int fields_read; - if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) + if (*recoded == '\0' || t_isspace_cstr(recoded) || t_iseq(recoded, '#')) goto nextline; fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask); @@ -1464,12 +1458,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename) s = findchar2(recoded, 'l', 'L'); if (s) { - while (*s && !t_isspace(s)) - s += pg_mblen(s); - while (*s && t_isspace(s)) - s += pg_mblen(s); + while (*s && !t_isspace_cstr(s)) + s += pg_mblen_cstr(s); + while (*s && t_isspace_cstr(s)) + s += pg_mblen_cstr(s); - if (*s && pg_mblen(s) == 1) + if (*s && pg_mblen_cstr(s) == 1) { addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG); Conf->usecompound = true; @@ -1497,8 +1491,8 @@ NIImportAffixes(IspellDict *Conf, const char *filename) s = recoded + 4; /* we need non-lowercased string */ flagflags = 0; - while (*s && t_isspace(s)) - s += pg_mblen(s); + while (*s && t_isspace_cstr(s)) + s += pg_mblen_cstr(s); if (*s == '*') { @@ -1519,14 +1513,13 @@ NIImportAffixes(IspellDict *Conf, const char *filename) * be followed by EOL, whitespace, or ':'. Otherwise this is a * new-format flag command. 
*/ - if (*s && pg_mblen(s) == 1) + if (*s && pg_mblen_cstr(s) == 1) { - COPYCHAR(flag, s); + flag[0] = *s++; flag[1] = '\0'; - s++; if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' || - t_isspace(s)) + t_isspace_cstr(s)) { oldformat = true; goto nextline; @@ -1750,7 +1743,7 @@ NISortDictionary(IspellDict *Conf) (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("invalid affix alias \"%s\"", Conf->Spell[i]->p.flag))); - if (*end != '\0' && !t_isdigit(end) && !t_isspace(end)) + if (*end != '\0' && !t_isdigit_cstr(end) && !t_isspace_cstr(end)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("invalid affix alias \"%s\"", diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index 537ad288f53..4a01b65d577 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -33,92 +33,44 @@ static void tsearch_readline_callback(void *arg); */ #define WC_BUF_LEN 3 -/* - * The reason these functions use a 3-wchar_t output buffer, not 2 as you - * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be - * getting from char2wchar() is UTF16 not UTF32. A single input character - * may therefore produce a surrogate pair rather than just one wchar_t; - * we also need room for a trailing null. When we do get a surrogate pair, - * we pass just the first code to iswdigit() etc, so that these functions will - * always return false for characters outside the Basic Multilingual Plane. 
- */ -#define WC_BUF_LEN 3 - -int -t_isdigit(const char *ptr) -{ - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ - - if (clen == 1 || database_ctype_is_c) - return isdigit(TOUCHAR(ptr)); - - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); - - return iswdigit((wint_t) character[0]); -} - -int -t_isspace(const char *ptr) -{ - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ - - if (clen == 1 || database_ctype_is_c) - return isspace(TOUCHAR(ptr)); - - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); - - return iswspace((wint_t) character[0]); -} - -int -t_isalpha(const char *ptr) -{ - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ - - if (clen == 1 || database_ctype_is_c) - return isalpha(TOUCHAR(ptr)); - - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); - - return iswalpha((wint_t) character[0]); -} - -int -t_isalnum(const char *ptr) -{ - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ - - if (clen == 1 || database_ctype_is_c) - return isalnum(TOUCHAR(ptr)); - - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); - - return iswalnum((wint_t) character[0]); -} - -int -t_isprint(const char *ptr) -{ - int clen = pg_mblen(ptr); - wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ - - if (clen == 1 || database_ctype_is_c) - return isprint(TOUCHAR(ptr)); - - char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); - - return iswprint((wint_t) character[0]); +#define GENERATE_T_ISCLASS_DEF(character_class) \ +/* mblen shall be that of the first character */ \ +int \ +t_is##character_class##_with_len(const char *ptr, int mblen) \ +{ \ + int clen = pg_mblen_with_len(ptr, mblen); \ + wchar_t character[WC_BUF_LEN]; \ + pg_locale_t mylocale = 0; /* TODO */ \ + if (clen == 1 || database_ctype_is_c) \ + return 
is##character_class(TOUCHAR(ptr)); \ + char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); \ + return isw##character_class((wint_t) character[0]); \ +} \ +\ +/* ptr shall point to a NUL-terminated string */ \ +int \ +t_is##character_class##_cstr(const char *ptr) \ +{ \ + return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \ +} \ +/* ptr shall point to a string with pre-validated encoding */ \ +int \ +t_is##character_class##_unbounded(const char *ptr) \ +{ \ + return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \ +} \ +/* historical name for _unbounded */ \ +int \ +t_is##character_class(const char *ptr) \ +{ \ + return t_is##character_class##_unbounded(ptr); \ } +GENERATE_T_ISCLASS_DEF(alnum) +GENERATE_T_ISCLASS_DEF(alpha) +GENERATE_T_ISCLASS_DEF(digit) +GENERATE_T_ISCLASS_DEF(print) +GENERATE_T_ISCLASS_DEF(space) /* * Set up to read a file using tsearch_readline(). This facility is diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c index 92afc67a5c4..511ba6be7ff 100644 --- a/src/backend/tsearch/ts_selfuncs.c +++ b/src/backend/tsearch/ts_selfuncs.c @@ -109,12 +109,14 @@ tsmatchsel(PG_FUNCTION_ARGS) * OK, there's a Var and a Const we're dealing with here. We need the * Const to be a TSQuery, else we can't do anything useful. We have to * check this because the Var might be the TSQuery not the TSVector. + * + * Also check that the Var really is a TSVector, in case this estimator is + * mistakenly attached to some other operator. 
*/ - if (((Const *) other)->consttype == TSQUERYOID) + if (((Const *) other)->consttype == TSQUERYOID && + vardata.vartype == TSVECTOROID) { /* tsvector @@ tsquery or the other way around */ - Assert(vardata.vartype == TSVECTOROID); - selec = tsquerysel(&vardata, ((Const *) other)->constvalue); } else diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c index 7c4c2a91123..463e5253558 100644 --- a/src/backend/tsearch/ts_utils.c +++ b/src/backend/tsearch/ts_utils.c @@ -88,8 +88,8 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *)) char *pbuf = line; /* Trim trailing space */ - while (*pbuf && !t_isspace(pbuf)) - pbuf += pg_mblen(pbuf); + while (*pbuf && !t_isspace_cstr(pbuf)) + pbuf += pg_mblen_cstr(pbuf); *pbuf = '\0'; /* Skip empty lines */ diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 9130b148366..05d605ade51 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -1727,7 +1727,8 @@ TParserGet(TParser *prs) prs->state->charlen = 0; else prs->state->charlen = (prs->charmaxlen == 1) ? 
prs->charmaxlen : - pg_mblen(prs->str + prs->state->posbyte); + pg_mblen_range(prs->str + prs->state->posbyte, + prs->str + prs->lenstr); Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr); Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null); diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index ff3e1af0a1d..ec28adc92c6 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -3803,6 +3803,12 @@ deconstruct_array_builtin(ArrayType *array, elmalign = TYPALIGN_SHORT; break; + case INT4OID: + elmlen = sizeof(int32); + elmbyval = true; + elmalign = TYPALIGN_INT; + break; + case OIDOID: elmlen = sizeof(Oid); elmbyval = true; diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index e5ac3ad23df..a20fbf18c24 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -215,7 +215,7 @@ hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext) ereturn(escontext, 0, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hexadecimal digit: \"%.*s\"", - pg_mblen(s), s))); + pg_mblen_range(s, srcend), s))); s++; if (s >= srcend) ereturn(escontext, 0, @@ -225,7 +225,7 @@ hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext) ereturn(escontext, 0, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hexadecimal digit: \"%.*s\"", - pg_mblen(s), s))); + pg_mblen_range(s, srcend), s))); s++; *p++ = (v1 << 4) | v2; } @@ -354,7 +354,7 @@ pg_base64_decode(const char *src, size_t len, char *dst) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence", - pg_mblen(s - 1), s - 1))); + pg_mblen_range(s - 1, srcend), s - 1))); } /* add it to buffer */ buf = (buf << 6) + b; diff --git a/src/backend/utils/adt/format_type.c b/src/backend/utils/adt/format_type.c index 12402a06379..a9054d11b0c 100644 --- 
a/src/backend/utils/adt/format_type.c +++ b/src/backend/utils/adt/format_type.c @@ -444,11 +444,15 @@ oidvectortypes(PG_FUNCTION_ARGS) { oidvector *oidArray = (oidvector *) PG_GETARG_POINTER(0); char *result; - int numargs = oidArray->dim1; + int numargs; int num; size_t total; size_t left; + /* validate input before fetching dim1 */ + check_valid_oidvector(oidArray); + numargs = oidArray->dim1; + total = 20 * numargs + 1; result = palloc(total); result[0] = '\0'; diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 738cdf81a7f..35e64caaaa6 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1385,7 +1385,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, ereport(ERROR, (errcode(ERRCODE_INVALID_DATETIME_FORMAT), errmsg("invalid datetime format separator: \"%s\"", - pnstrdup(str, pg_mblen(str))))); + pnstrdup(str, pg_mblen_cstr(str))))); if (*str == ' ') n->type = NODE_TYPE_SPACE; @@ -1415,7 +1415,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, /* backslash quotes the next character, if any */ if (*str == '\\' && *(str + 1)) str++; - chlen = pg_mblen(str); + chlen = pg_mblen_cstr(str); n->type = NODE_TYPE_CHAR; memcpy(n->character, str, chlen); n->character[chlen] = '\0'; @@ -1433,7 +1433,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, */ if (*str == '\\' && *(str + 1) == '"') str++; - chlen = pg_mblen(str); + chlen = pg_mblen_cstr(str); if ((flags & DCH_FLAG) && is_separator_char(str)) n->type = NODE_TYPE_SEPARATOR; @@ -2138,8 +2138,8 @@ asc_toupper_z(const char *buff) do { \ if (S_THth(_suf)) \ { \ - if (*(ptr)) (ptr) += pg_mblen(ptr); \ - if (*(ptr)) (ptr) += pg_mblen(ptr); \ + if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \ + if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \ } \ } while (0) @@ -3345,7 +3345,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, * insist that the consumed character match the 
format's * character. */ - s += pg_mblen(s); + s += pg_mblen_cstr(s); } continue; } @@ -3367,11 +3367,11 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, if (extra_skip > 0) extra_skip--; else - s += pg_mblen(s); + s += pg_mblen_cstr(s); } else { - int chlen = pg_mblen(s); + int chlen = pg_mblen_cstr(s); /* * Standard mode requires strict match of format characters. @@ -5615,13 +5615,15 @@ NUM_numpart_to_char(NUMProc *Np, int id) static void NUM_eat_non_data_chars(NUMProc *Np, int n, int input_len) { + const char *end = Np->inout + input_len; + while (n-- > 0) { if (OVERLOAD_TEST) break; /* end of input */ if (strchr("0123456789.,+-", *Np->inout_p) != NULL) break; /* it's a data character */ - Np->inout_p += pg_mblen(Np->inout_p); + Np->inout_p += pg_mblen_range(Np->inout_p, end); } } @@ -6074,7 +6076,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout, } else { - Np->inout_p += pg_mblen(Np->inout_p); + Np->inout_p += pg_mblen_range(Np->inout_p, Np->inout + input_len); } continue; } diff --git a/src/backend/utils/adt/int.c b/src/backend/utils/adt/int.c index 44d1c7ad0c4..f9a08257ac3 100644 --- a/src/backend/utils/adt/int.c +++ b/src/backend/utils/adt/int.c @@ -134,6 +134,30 @@ buildint2vector(const int16 *int2s, int n) return result; } +/* + * validate that an array object meets the restrictions of int2vector + * + * We need this because there are pathways by which a general int2[] array can + * be cast to int2vector, allowing the type's restrictions to be violated. + * All code that receives an int2vector as a SQL parameter should check this. + */ +static void +check_valid_int2vector(const int2vector *int2Array) +{ + /* + * We insist on ndim == 1 and dataoffset == 0 (that is, no nulls) because + * otherwise the array's layout will not be what calling code expects. We + * needn't be picky about the index lower bound though. Checking elemtype + * is just paranoia. 
+ */ + if (int2Array->ndim != 1 || + int2Array->dataoffset != 0 || + int2Array->elemtype != INT2OID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("array is not a valid int2vector"))); +} + /* * int2vectorin - converts "num num ..." to internal form */ @@ -208,10 +232,14 @@ int2vectorout(PG_FUNCTION_ARGS) { int2vector *int2Array = (int2vector *) PG_GETARG_POINTER(0); int num, - nnums = int2Array->dim1; + nnums; char *rp; char *result; + /* validate input before fetching dim1 */ + check_valid_int2vector(int2Array); + nnums = int2Array->dim1; + /* assumes sign, 5 digits, ' ' */ rp = result = (char *) palloc(nnums * 7 + 1); for (num = 0; num < nnums; num++) @@ -272,6 +300,7 @@ int2vectorrecv(PG_FUNCTION_ARGS) Datum int2vectorsend(PG_FUNCTION_ARGS) { + /* We don't do check_valid_int2vector, since array_send won't care */ return array_send(fcinfo); } diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index 70cb922e6b7..42b886c621a 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -682,7 +682,7 @@ report_json_context(JsonLexContext *lex) { /* Advance to next multibyte character */ if (IS_HIGHBIT_SET(*context_start)) - context_start += pg_mblen(context_start); + context_start += pg_mblen_range(context_start, context_end); else context_start++; } diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y index adc259d5bf8..c1880c113b8 100644 --- a/src/backend/utils/adt/jsonpath_gram.y +++ b/src/backend/utils/adt/jsonpath_gram.y @@ -527,7 +527,8 @@ makeItemLikeRegex(JsonPathParseItem *expr, JsonPathString *pattern, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid input syntax for type %s", "jsonpath"), errdetail("Unrecognized flag character \"%.*s\" in LIKE_REGEX predicate.", - pg_mblen(flags->val + i), flags->val + i))); + pg_mblen_range(flags->val + i, flags->val + flags->len), + flags->val + i))); break; } } diff --git 
a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c index 0763daf0e83..9a84c8d0fc4 100644 --- a/src/backend/utils/adt/levenshtein.c +++ b/src/backend/utils/adt/levenshtein.c @@ -83,6 +83,8 @@ varstr_levenshtein(const char *source, int slen, int *s_char_len = NULL; int j; const char *y; + const char *send = source + slen; + const char *tend = target + tlen; /* * For varstr_levenshtein_less_equal, we have real variables called @@ -183,10 +185,10 @@ varstr_levenshtein(const char *source, int slen, #endif /* - * In order to avoid calling pg_mblen() repeatedly on each character in s, - * we cache all the lengths before starting the main loop -- but if all - * the characters in both strings are single byte, then we skip this and - * use a fast-path in the main loop. If only one string contains + * In order to avoid calling pg_mblen_range() repeatedly on each character + * in s, we cache all the lengths before starting the main loop -- but if + * all the characters in both strings are single byte, then we skip this + * and use a fast-path in the main loop. If only one string contains * multi-byte characters, we still build the array, so that the fast-path * needn't deal with the case where the array hasn't been initialized. */ @@ -198,7 +200,7 @@ varstr_levenshtein(const char *source, int slen, s_char_len = (int *) palloc((m + 1) * sizeof(int)); for (i = 0; i < m; ++i) { - s_char_len[i] = pg_mblen(cp); + s_char_len[i] = pg_mblen_range(cp, send); cp += s_char_len[i]; } s_char_len[i] = 0; @@ -224,7 +226,7 @@ varstr_levenshtein(const char *source, int slen, { int *temp; const char *x = source; - int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1; + int y_char_len = n != tlen + 1 ? 
pg_mblen_range(y, tend) : 1; int i; #ifdef LEVENSHTEIN_LESS_EQUAL diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 33a2f46aab0..776112c695f 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -55,20 +55,20 @@ static int Generic_Text_IC_like(text *str, text *pat, Oid collation); *-------------------- */ static inline int -wchareq(const char *p1, const char *p2) +wchareq(const char *p1, int p1len, const char *p2, int p2len) { - int p1_len; + int p1clen; /* Optimization: quickly compare the first byte. */ if (*p1 != *p2) return 0; - p1_len = pg_mblen(p1); - if (pg_mblen(p2) != p1_len) + p1clen = pg_mblen_with_len(p1, p1len); + if (pg_mblen_with_len(p2, p2len) != p1clen) return 0; /* They are the same length */ - while (p1_len--) + while (p1clen--) { if (*p1++ != *p2++) return 0; @@ -107,11 +107,11 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c) #define NextByte(p, plen) ((p)++, (plen)--) /* Set up to compile like_match.c for multibyte characters */ -#define CHAREQ(p1, p2) wchareq((p1), (p2)) +#define CHAREQ(p1, p1len, p2, p2len) wchareq((p1), (p1len), (p2), (p2len)) #define NextChar(p, plen) \ - do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0) + do { int __l = pg_mblen_with_len((p), (plen)); (p) +=__l; (plen) -=__l; } while (0) #define CopyAdvChar(dst, src, srclen) \ - do { int __l = pg_mblen(src); \ + do { int __l = pg_mblen_with_len((src), (srclen)); \ (srclen) -= __l; \ while (__l-- > 0) \ *(dst)++ = *(src)++; \ @@ -123,7 +123,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c) #include "like_match.c" /* Set up to compile like_match.c for single-byte characters */ -#define CHAREQ(p1, p2) (*(p1) == *(p2)) +#define CHAREQ(p1, p1len, p2, p2len) (*(p1) == *(p2)) #define NextChar(p, plen) NextByte((p), (plen)) #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) diff --git a/src/backend/utils/adt/like_match.c 
b/src/backend/utils/adt/like_match.c index 8b2dff6d6e2..e586de9efd1 100644 --- a/src/backend/utils/adt/like_match.c +++ b/src/backend/utils/adt/like_match.c @@ -294,6 +294,7 @@ do_like_escape(text *pat, text *esc) errhint("Escape string must be empty or one character."))); e = VARDATA_ANY(esc); + elen = VARSIZE_ANY_EXHDR(esc); /* * If specified escape is '\', just copy the pattern as-is. @@ -312,7 +313,7 @@ do_like_escape(text *pat, text *esc) afterescape = false; while (plen > 0) { - if (CHAREQ(p, e) && !afterescape) + if (CHAREQ(p, plen, e, elen) && !afterescape) { *r++ = '\\'; NextChar(p, plen); diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c index a8c0f954dfa..192df1c569d 100644 --- a/src/backend/utils/adt/network_selfuncs.c +++ b/src/backend/utils/adt/network_selfuncs.c @@ -43,9 +43,9 @@ /* Maximum number of items to consider in join selectivity calculations */ #define MAX_CONSIDERED_ELEMS 1024 -static Selectivity networkjoinsel_inner(Oid operator, +static Selectivity networkjoinsel_inner(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2); -static Selectivity networkjoinsel_semi(Oid operator, +static Selectivity networkjoinsel_semi(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2); static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues); static Selectivity inet_hist_value_sel(Datum *values, int nvalues, @@ -82,6 +82,7 @@ networksel(PG_FUNCTION_ARGS) Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); int varRelid = PG_GETARG_INT32(3); + int opr_codenum; VariableStatData vardata; Node *other; bool varonleft; @@ -95,6 +96,14 @@ networksel(PG_FUNCTION_ARGS) nullfrac; FmgrInfo proc; + /* + * Before all else, verify that the operator is one of the ones supported + * by this function, which in turn proves that the input datatypes are + * what we expect. 
Otherwise, attaching this selectivity function to some + * unexpected operator could cause trouble. + */ + opr_codenum = inet_opr_codenum(operator); + /* * If expression is not (variable op something) or (something op * variable), then punt and return a default estimate. @@ -150,13 +159,12 @@ networksel(PG_FUNCTION_ARGS) STATISTIC_KIND_HISTOGRAM, InvalidOid, ATTSTATSSLOT_VALUES)) { - int opr_codenum = inet_opr_codenum(operator); + int h_codenum; /* Commute if needed, so we can consider histogram to be on the left */ - if (!varonleft) - opr_codenum = -opr_codenum; + h_codenum = varonleft ? opr_codenum : -opr_codenum; non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues, - constvalue, opr_codenum); + constvalue, h_codenum); free_attstatsslot(&hslot); } @@ -203,10 +211,19 @@ networkjoinsel(PG_FUNCTION_ARGS) #endif SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); double selec; + int opr_codenum; VariableStatData vardata1; VariableStatData vardata2; bool join_is_reversed; + /* + * Before all else, verify that the operator is one of the ones supported + * by this function, which in turn proves that the input datatypes are + * what we expect. Otherwise, attaching this selectivity function to some + * unexpected operator could cause trouble. + */ + opr_codenum = inet_opr_codenum(operator); + get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &join_is_reversed); @@ -220,16 +237,19 @@ networkjoinsel(PG_FUNCTION_ARGS) * Selectivity for left/full join is not exactly the same as inner * join, but we neglect the difference, as eqjoinsel does. */ - selec = networkjoinsel_inner(operator, &vardata1, &vardata2); + selec = networkjoinsel_inner(operator, opr_codenum, + &vardata1, &vardata2); break; case JOIN_SEMI: case JOIN_ANTI: case JOIN_LASJ_NOTIN: /* Here, it's important that we pass the outer var on the left. 
*/ if (!join_is_reversed) - selec = networkjoinsel_semi(operator, &vardata1, &vardata2); + selec = networkjoinsel_semi(operator, opr_codenum, + &vardata1, &vardata2); else selec = networkjoinsel_semi(get_commutator(operator), + -opr_codenum, &vardata2, &vardata1); break; default: @@ -261,7 +281,7 @@ networkjoinsel(PG_FUNCTION_ARGS) * Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner(). */ static Selectivity -networkjoinsel_inner(Oid operator, +networkjoinsel_inner(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2) { Form_pg_statistic stats; @@ -274,7 +294,6 @@ networkjoinsel_inner(Oid operator, mcv2_exists = false, hist1_exists = false, hist2_exists = false; - int opr_codenum; int mcv1_length = 0, mcv2_length = 0; AttStatsSlot mcv1_slot; @@ -326,8 +345,6 @@ networkjoinsel_inner(Oid operator, memset(&hist2_slot, 0, sizeof(hist2_slot)); } - opr_codenum = inet_opr_codenum(operator); - /* * Calculate selectivity for MCV vs MCV matches. */ @@ -388,7 +405,7 @@ networkjoinsel_inner(Oid operator, * histogram selectivity for semi/anti join cases. */ static Selectivity -networkjoinsel_semi(Oid operator, +networkjoinsel_semi(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2) { Form_pg_statistic stats; @@ -402,7 +419,6 @@ networkjoinsel_semi(Oid operator, mcv2_exists = false, hist1_exists = false, hist2_exists = false; - int opr_codenum; FmgrInfo proc; int i, mcv1_length = 0, @@ -456,7 +472,6 @@ networkjoinsel_semi(Oid operator, memset(&hist2_slot, 0, sizeof(hist2_slot)); } - opr_codenum = inet_opr_codenum(operator); fmgr_info(get_opcode(operator), &proc); /* Estimate number of input rows represented by RHS histogram. */ @@ -828,6 +843,9 @@ inet_semi_join_sel(Datum lhs_value, /* * Assign useful code numbers for the subnet inclusion/overlap operators * + * This will throw an error if the operator is not one of the ones we + * support in networksel() and networkjoinsel(). 
+ * * Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend * on the exact codes assigned here; but many other places in this file * know that they can negate a code to obtain the code for the commutator diff --git a/src/backend/utils/adt/oid.c b/src/backend/utils/adt/oid.c index 3f7af5b3a06..066511443cf 100644 --- a/src/backend/utils/adt/oid.c +++ b/src/backend/utils/adt/oid.c @@ -106,6 +106,30 @@ buildoidvector(const Oid *oids, int n) return result; } +/* + * validate that an array object meets the restrictions of oidvector + * + * We need this because there are pathways by which a general oid[] array can + * be cast to oidvector, allowing the type's restrictions to be violated. + * All code that receives an oidvector as a SQL parameter should check this. + */ +void +check_valid_oidvector(const oidvector *oidArray) +{ + /* + * We insist on ndim == 1 and dataoffset == 0 (that is, no nulls) because + * otherwise the array's layout will not be what calling code expects. We + * needn't be picky about the index lower bound though. Checking elemtype + * is just paranoia. + */ + if (oidArray->ndim != 1 || + oidArray->dataoffset != 0 || + oidArray->elemtype != OIDOID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("array is not a valid oidvector"))); +} + /* * oidvectorin - converts "num num ..." 
to internal form */ @@ -158,10 +182,14 @@ oidvectorout(PG_FUNCTION_ARGS) { oidvector *oidArray = (oidvector *) PG_GETARG_POINTER(0); int num, - nnums = oidArray->dim1; + nnums; char *rp; char *result; + /* validate input before fetching dim1 */ + check_valid_oidvector(oidArray); + nnums = oidArray->dim1; + /* assumes sign, 10 digits, ' ' */ rp = result = (char *) palloc(nnums * 12 + 1); for (num = 0; num < nnums; num++) @@ -224,6 +252,7 @@ oidvectorrecv(PG_FUNCTION_ARGS) Datum oidvectorsend(PG_FUNCTION_ARGS) { + /* We don't do check_valid_oidvector, since array_send won't care */ return array_send(fcinfo); } diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c index 3b5b794afb3..8d025011e2f 100644 --- a/src/backend/utils/adt/oracle_compat.c +++ b/src/backend/utils/adt/oracle_compat.c @@ -153,8 +153,8 @@ lpad(PG_FUNCTION_ARGS) char *ptr1, *ptr2, *ptr2start, - *ptr2end, *ptr_ret; + const char *ptr2end; int m, s1len, s2len; @@ -199,7 +199,7 @@ lpad(PG_FUNCTION_ARGS) while (m--) { - int mlen = pg_mblen(ptr2); + int mlen = pg_mblen_range(ptr2, ptr2end); memcpy(ptr_ret, ptr2, mlen); ptr_ret += mlen; @@ -212,7 +212,7 @@ lpad(PG_FUNCTION_ARGS) while (s1len--) { - int mlen = pg_mblen(ptr1); + int mlen = pg_mblen_unbounded(ptr1); memcpy(ptr_ret, ptr1, mlen); ptr_ret += mlen; @@ -251,8 +251,8 @@ rpad(PG_FUNCTION_ARGS) char *ptr1, *ptr2, *ptr2start, - *ptr2end, *ptr_ret; + const char *ptr2end; int m, s1len, s2len; @@ -292,11 +292,12 @@ rpad(PG_FUNCTION_ARGS) m = len - s1len; ptr1 = VARDATA_ANY(string1); + ptr_ret = VARDATA(ret); while (s1len--) { - int mlen = pg_mblen(ptr1); + int mlen = pg_mblen_unbounded(ptr1); memcpy(ptr_ret, ptr1, mlen); ptr_ret += mlen; @@ -308,7 +309,7 @@ rpad(PG_FUNCTION_ARGS) while (m--) { - int mlen = pg_mblen(ptr2); + int mlen = pg_mblen_range(ptr2, ptr2end); memcpy(ptr_ret, ptr2, mlen); ptr_ret += mlen; @@ -393,6 +394,7 @@ dotrim(const char *string, int stringlen, */ const char **stringchars; const char 
**setchars; + const char *setend; int *stringmblen; int *setmblen; int stringnchars; @@ -400,6 +402,7 @@ dotrim(const char *string, int stringlen, int resultndx; int resultnchars; const char *p; + const char *pend; int len; int mblen; const char *str_pos; @@ -410,10 +413,11 @@ dotrim(const char *string, int stringlen, stringnchars = 0; p = string; len = stringlen; + pend = p + len; while (len > 0) { stringchars[stringnchars] = p; - stringmblen[stringnchars] = mblen = pg_mblen(p); + stringmblen[stringnchars] = mblen = pg_mblen_range(p, pend); stringnchars++; p += mblen; len -= mblen; @@ -424,10 +428,11 @@ dotrim(const char *string, int stringlen, setnchars = 0; p = set; len = setlen; + setend = set + setlen; while (len > 0) { setchars[setnchars] = p; - setmblen[setnchars] = mblen = pg_mblen(p); + setmblen[setnchars] = mblen = pg_mblen_range(p, setend); setnchars++; p += mblen; len -= mblen; @@ -805,6 +810,8 @@ translate(PG_FUNCTION_ARGS) *to_end; char *source, *target; + const char *source_end; + const char *from_end; int m, fromlen, tolen, @@ -819,9 +826,11 @@ translate(PG_FUNCTION_ARGS) if (m <= 0) PG_RETURN_TEXT_P(string); source = VARDATA_ANY(string); + source_end = source + m; fromlen = VARSIZE_ANY_EXHDR(from); from_ptr = VARDATA_ANY(from); + from_end = from_ptr + fromlen; tolen = VARSIZE_ANY_EXHDR(to); to_ptr = VARDATA_ANY(to); to_end = to_ptr + tolen; @@ -845,12 +854,12 @@ translate(PG_FUNCTION_ARGS) while (m > 0) { - source_len = pg_mblen(source); + source_len = pg_mblen_range(source, source_end); from_index = 0; for (i = 0; i < fromlen; i += len) { - len = pg_mblen(&from_ptr[i]); + len = pg_mblen_range(&from_ptr[i], from_end); if (len == source_len && memcmp(source, &from_ptr[i], len) == 0) break; @@ -866,11 +875,11 @@ translate(PG_FUNCTION_ARGS) { if (p >= to_end) break; - p += pg_mblen(p); + p += pg_mblen_range(p, to_end); } if (p < to_end) { - len = pg_mblen(p); + len = pg_mblen_range(p, to_end); memcpy(target, p, len); target += len; retlen += len; diff 
--git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 702cd52b6d4..d43a0577ee4 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -444,7 +444,7 @@ parse_re_flags(pg_re_flags *flags, text *opts) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression option: \"%.*s\"", - pg_mblen(opt_p + i), opt_p + i))); + pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i))); break; } } @@ -674,12 +674,13 @@ textregexreplace(PG_FUNCTION_ARGS) if (VARSIZE_ANY_EXHDR(opt) > 0) { char *opt_p = VARDATA_ANY(opt); + const char *end_p = opt_p + VARSIZE_ANY_EXHDR(opt); if (*opt_p >= '0' && *opt_p <= '9') ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression option: \"%.*s\"", - pg_mblen(opt_p), opt_p), + pg_mblen_range(opt_p, end_p), opt_p), errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly."))); } @@ -773,12 +774,14 @@ similar_escape_internal(text *pat_text, text *esc_text) *r; int plen, elen; + const char *pend; bool afterescape = false; bool incharclass = false; int nquotes = 0; p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); + pend = p + plen; if (esc_text == NULL) { /* No ESCAPE clause provided; default to backslash as escape */ @@ -867,7 +870,7 @@ similar_escape_internal(text *pat_text, text *esc_text) if (elen > 1) { - int mblen = pg_mblen(p); + int mblen = pg_mblen_range(p, pend); if (mblen > 1) { diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 001f2cc0299..244187e474b 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -120,7 +120,7 @@ get_modifiers(char *buf, int16 *weight, bool *prefix) return buf; buf++; - while (*buf && pg_mblen(buf) == 1) + while (*buf && pg_mblen_cstr(buf) == 1) { switch (*buf) { @@ -197,7 +197,7 @@ parse_phrase_operator(TSQueryParserState pstate, int16 *distance) continue; 
} - if (!t_isdigit(ptr)) + if (!t_isdigit_cstr(ptr)) return false; errno = 0; @@ -259,12 +259,12 @@ parse_or_operator(TSQueryParserState pstate) return false; /* it shouldn't be a part of any word */ - if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum(ptr)) + if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum_cstr(ptr)) return false; for (;;) { - ptr += pg_mblen(ptr); + ptr += pg_mblen_cstr(ptr); if (*ptr == '\0') /* got end of string without operand */ return false; @@ -274,7 +274,7 @@ parse_or_operator(TSQueryParserState pstate) * So we still treat OR literal as operation with possibly incorrect * operand and will not search it as lexeme */ - if (!t_isspace(ptr)) + if (!t_isspace_cstr(ptr)) break; } @@ -315,7 +315,7 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator, /* generic syntax error message is fine */ return PT_ERR; } - else if (!t_isspace(state->buf)) + else if (!t_isspace_cstr(state->buf)) { /* * We rely on the tsvector parser to parse the value for @@ -383,14 +383,14 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator, { return (state->count) ? 
PT_ERR : PT_END; } - else if (!t_isspace(state->buf)) + else if (!t_isspace_cstr(state->buf)) { return PT_ERR; } break; } - state->buf += pg_mblen(state->buf); + state->buf += pg_mblen_cstr(state->buf); } } @@ -444,7 +444,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, state->state = WAITOPERAND; continue; } - else if (!t_isspace(state->buf)) + else if (!t_isspace_cstr(state->buf)) { /* * We rely on the tsvector parser to parse the value for @@ -492,7 +492,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, state->buf++; continue; } - else if (!t_isspace(state->buf)) + else if (!t_isspace_cstr(state->buf)) { /* insert implicit AND between operands */ state->state = WAITOPERAND; @@ -502,7 +502,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, break; } - state->buf += pg_mblen(state->buf); + state->buf += pg_mblen_cstr(state->buf); } } @@ -1014,9 +1014,8 @@ infix(INFIX *in, int parentPriority, bool rightPhraseOp) *(in->cur) = '\\'; in->cur++; } - COPYCHAR(in->cur, op); - clen = pg_mblen(op); + clen = ts_copychar_cstr(in->cur, op); op += clen; in->cur += clen; } diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 85c492d122a..39e16f8a7cd 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -320,9 +320,9 @@ tsvectorout(PG_FUNCTION_ARGS) lenbuf = 0, pp; WordEntry *ptr = ARRPTR(out); - char *curbegin, - *curin, + char *curin, *curout; + const char *curend; lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; for (i = 0; i < out->size; i++) @@ -335,13 +335,14 @@ tsvectorout(PG_FUNCTION_ARGS) curout = outbuf = (char *) palloc(lenbuf); for (i = 0; i < out->size; i++) { - curbegin = curin = STRPTR(out) + ptr->pos; + curin = STRPTR(out) + ptr->pos; + curend = curin + ptr->len; if (i != 0) *curout++ = ' '; *curout++ = '\''; - while (curin - curbegin < ptr->len) + while (curin < curend) { - int len = pg_mblen(curin); + int 
len = pg_mblen_range(curin, curend); if (t_iseq(curin, '\'')) *curout++ = '\''; diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index f511a28bb04..ae90e750604 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -2615,11 +2615,15 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) if (ws) { char *buf; + const char *end; buf = VARDATA_ANY(ws); - while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws)) + end = buf + VARSIZE_ANY_EXHDR(ws); + while (buf < end) { - if (pg_mblen(buf) == 1) + int len = pg_mblen_range(buf, end); + + if (len == 1) { switch (*buf) { @@ -2643,7 +2647,7 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) stat->weight |= 0; } } - buf += pg_mblen(buf); + buf += len; } } diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c index 13e075831fe..e4b91f8d3c4 100644 --- a/src/backend/utils/adt/tsvector_parser.c +++ b/src/backend/utils/adt/tsvector_parser.c @@ -206,10 +206,9 @@ gettoken_tsvector(TSVectorParseState state, else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) || (state->is_web && t_iseq(state->prsbuf, '"'))) PRSSYNTAXERROR; - else if (!t_isspace(state->prsbuf)) + else if (!t_isspace_cstr(state->prsbuf)) { - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); statecode = WAITENDWORD; } } @@ -223,8 +222,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); Assert(oldstate != 0); statecode = oldstate; } @@ -236,7 +234,7 @@ gettoken_tsvector(TSVectorParseState state, statecode = WAITNEXTCHAR; oldstate = WAITENDWORD; } - else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || + else if (t_isspace_cstr(state->prsbuf) || *(state->prsbuf) == '\0' || (state->oprisdelim && 
ISOPERATOR(state->prsbuf)) || (state->is_web && t_iseq(state->prsbuf, '"'))) { @@ -259,8 +257,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); } } else if (statecode == WAITENDCMPLX) @@ -279,8 +276,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); } } else if (statecode == WAITCHARCMPLX) @@ -288,8 +284,7 @@ gettoken_tsvector(TSVectorParseState state, if (!state->is_web && t_iseq(state->prsbuf, '\'')) { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); statecode = WAITENDCMPLX; } else @@ -300,7 +295,7 @@ gettoken_tsvector(TSVectorParseState state, PRSSYNTAXERROR; if (state->oprisdelim) { - /* state->prsbuf+=pg_mblen(state->prsbuf); */ + /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */ RETURN_TOKEN; } else @@ -317,7 +312,7 @@ gettoken_tsvector(TSVectorParseState state, } else if (statecode == INPOSINFO) { - if (t_isdigit(state->prsbuf)) + if (t_isdigit_cstr(state->prsbuf)) { if (posalen == 0) { @@ -372,10 +367,10 @@ gettoken_tsvector(TSVectorParseState state, PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 0); } - else if (t_isspace(state->prsbuf) || + else if (t_isspace_cstr(state->prsbuf) || *(state->prsbuf) == '\0') RETURN_TOKEN; - else if (!t_isdigit(state->prsbuf)) + else if (!t_isdigit_cstr(state->prsbuf)) PRSSYNTAXERROR; } else /* internal error */ @@ -383,6 +378,6 @@ gettoken_tsvector(TSVectorParseState state, statecode); /* get next char */ - state->prsbuf += pg_mblen(state->prsbuf); + state->prsbuf += pg_mblen_cstr(state->prsbuf); } } diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c index 7e1457cb9ef..c53356bbb46 100644 --- a/src/backend/utils/adt/varbit.c 
+++ b/src/backend/utils/adt/varbit.c @@ -233,7 +233,7 @@ bit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid binary digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); x >>= 1; if (x == 0) @@ -258,7 +258,7 @@ bit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid hexadecimal digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); if (bc) { @@ -534,7 +534,7 @@ varbit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid binary digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); x >>= 1; if (x == 0) @@ -559,7 +559,7 @@ varbit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid hexadecimal digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); if (bc) { diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index b5f0018e8f3..1f8fcd1f406 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -797,8 +797,11 @@ text_catenate(text *t1, text *t2) * charlen_to_bytelen() * Compute the number of bytes occupied by n characters starting at *p * - * It is caller's responsibility that there actually are n characters; - * the string need not be null-terminated. + * The caller shall ensure there are n complete characters. Callers achieve + * this by deriving "n" from regmatch_t findings from searching a wchar array. + * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex + * matches will end no later than the last complete character. (The string + * need not be null-terminated.) 
*/ static int charlen_to_bytelen(const char *p, int n) @@ -813,7 +816,7 @@ charlen_to_bytelen(const char *p, int n) const char *s; for (s = p; n > 0; n--) - s += pg_mblen(s); + s += pg_mblen_unbounded(s); /* caller verified encoding */ return s - p; } @@ -946,6 +949,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) int32 slice_start; int32 slice_size; int32 slice_strlen; + int32 slice_len; text *slice; int32 E1; int32 i; @@ -1015,7 +1019,8 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) slice = (text *) DatumGetPointer(str); /* see if we got back an empty string */ - if (VARSIZE_ANY_EXHDR(slice) == 0) + slice_len = VARSIZE_ANY_EXHDR(slice); + if (slice_len == 0) { if (slice != (text *) DatumGetPointer(str)) pfree(slice); @@ -1024,7 +1029,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) /* Now we can get the actual length of the slice in MB characters */ slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice), - VARSIZE_ANY_EXHDR(slice)); + slice_len); /* * Check that the start position wasn't > slice_strlen. If so, SQL99 @@ -1051,7 +1056,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) */ p = VARDATA_ANY(slice); for (i = 0; i < S1 - 1; i++) - p += pg_mblen(p); + p += pg_mblen_unbounded(p); /* hang onto a pointer to our start position */ s = p; @@ -1061,7 +1066,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) * length. */ for (i = S1; i < E1; i++) - p += pg_mblen(p); + p += pg_mblen_unbounded(p); ret = (text *) palloc(VARHDRSZ + (p - s)); SET_VARSIZE(ret, VARHDRSZ + (p - s)); @@ -1359,6 +1364,8 @@ text_position_next(TextPositionState *state) */ if (state->is_multibyte_char_in_char) { + const char *haystack_end = state->str1 + state->len1; + /* Walk one character at a time, until we reach the match. */ /* the search should never move backwards. 
*/ @@ -1367,7 +1374,7 @@ text_position_next(TextPositionState *state) while (state->refpoint < matchptr) { /* step to next character. */ - state->refpoint += pg_mblen(state->refpoint); + state->refpoint += pg_mblen_range(state->refpoint, haystack_end); state->refpos++; /* @@ -4682,6 +4689,8 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate) } else { + const char *end_ptr; + /* * When fldsep is NULL, each character in the input string becomes a * separate element in the result set. The separator is effectively @@ -4690,10 +4699,11 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate) inputstring_len = VARSIZE_ANY_EXHDR(inputstring); start_ptr = VARDATA_ANY(inputstring); + end_ptr = start_ptr + inputstring_len; while (inputstring_len > 0) { - int chunk_len = pg_mblen(start_ptr); + int chunk_len = pg_mblen_range(start_ptr, end_ptr); CHECK_FOR_INTERRUPTS(); @@ -5524,7 +5534,7 @@ text_reverse(PG_FUNCTION_ARGS) { int sz; - sz = pg_mblen(p); + sz = pg_mblen_range(p, endp); dst -= sz; memcpy(dst, p, sz); p += sz; @@ -5685,7 +5695,7 @@ text_format(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized format() type specifier \"%.*s\"", - pg_mblen(cp), cp), + pg_mblen_range(cp, end_ptr), cp), errhint("For a single \"%%\" use \"%%%%\"."))); /* If indirect width was specified, get its value */ @@ -5806,7 +5816,7 @@ text_format(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized format() type specifier \"%.*s\"", - pg_mblen(cp), cp), + pg_mblen_range(cp, end_ptr), cp), errhint("For a single \"%%\" use \"%%%%\"."))); break; } diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 1537adfb7bf..45132fcc0fa 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -2329,8 +2329,7 @@ sqlchar_to_unicode(const char *s) char *utf8string; pg_wchar ret[2]; /* need space for trailing zero */ - /* note we're not assuming s is 
null-terminated */ - utf8string = pg_server_to_any(s, pg_mblen(s), PG_UTF8); + utf8string = pg_server_to_any(s, pg_mblen_cstr(s), PG_UTF8); pg_encoding_mb2wchar_with_len(PG_UTF8, utf8string, ret, pg_encoding_mblen(PG_UTF8, utf8string)); @@ -2383,7 +2382,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, initStringInfo(&buf); - for (p = ident; *p; p += pg_mblen(p)) + for (p = ident; *p; p += pg_mblen_cstr(p)) { if (*p == ':' && (p == ident || fully_escaped)) appendStringInfoString(&buf, "_x003A_"); @@ -2408,7 +2407,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, : !is_valid_xml_namechar(u)) appendStringInfo(&buf, "_x%04X_", (unsigned int) u); else - appendBinaryStringInfo(&buf, p, pg_mblen(p)); + appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p)); } } @@ -2431,7 +2430,7 @@ map_xml_name_to_sql_identifier(const char *name) initStringInfo(&buf); - for (p = name; *p; p += pg_mblen(p)) + for (p = name; *p; p += pg_mblen_cstr(p)) { if (*p == '_' && *(p + 1) == 'x' && isxdigit((unsigned char) *(p + 2)) @@ -2449,7 +2448,7 @@ map_xml_name_to_sql_identifier(const char *name) p += 6; } else - appendBinaryStringInfo(&buf, p, pg_mblen(p)); + appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p)); } return buf.data; diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 65c20bde39e..a8901a957eb 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -43,6 +43,7 @@ #include "catalog/pg_directory_table.h" #include "catalog/pg_enum.h" #include "catalog/pg_event_trigger.h" +#include "catalog/pg_extension.h" #include "catalog/pg_foreign_data_wrapper.h" #include "catalog/pg_foreign_server.h" #include "catalog/pg_foreign_table.h" @@ -823,6 +824,13 @@ static const struct cachedesc cacheinfo[] = { 0 }, 128 + }, + /* intentionally out of alphabetical order, to avoid an ABI break: */ + [EXTENSIONOID] = { + ExtensionRelationId, + ExtensionOidIndexId, + 
KEY(Anum_pg_extension_oid), + 2 } }; diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 87ed364aab4..0477acc1e08 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -38,6 +38,7 @@ #include "catalog/namespace.h" #include "mb/pg_wchar.h" #include "utils/builtins.h" +#include "utils/memdebug.h" #include "utils/memutils.h" #include "utils/syscache.h" #include "varatt.h" @@ -100,6 +101,13 @@ perform_default_encoding_conversion(const char *src, int len, bool is_client_to_ FmgrInfo *custom_encoding_proc); static int cliplen(const char *str, int len, int limit); +pg_attribute_noreturn() +static void report_invalid_encoding_int(int encoding, const char *mbstr, + int mblen, int len); + +pg_attribute_noreturn() +static void report_invalid_encoding_db(const char *mbstr, int mblen, int len); + /* * Prepare for a future call to SetClientEncoding. Success should mean @@ -1149,11 +1157,126 @@ pg_encoding_wchar2mb_with_len(int encoding, return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len); } -/* returns the byte length of a multibyte character */ +/* + * Returns the byte length of a multibyte character sequence in a + * null-terminated string. Raises an illegal byte sequence error if the + * sequence would hit a null terminator. + * + * The caller is expected to have checked for a terminator at *mbstr == 0 + * before calling, but some callers want 1 in that case, so this function + * continues that tradition. + * + * This must only be used for strings that have a null-terminator to enable + * bounds detection. + */ +int +pg_mblen_cstr(const char *mbstr) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + /* + * The .mblen functions return 1 when given a pointer to a terminator. + * Some callers depend on that, so we tolerate it for now. Well-behaved + * callers check the leading byte for a terminator *before* calling. 
+ */ + for (int i = 1; i < length; ++i) + if (unlikely(mbstr[i] == 0)) + report_invalid_encoding_db(mbstr, length, i); + + /* + * String should be NUL-terminated, but checking that would make typical + * callers O(N^2), tripling Valgrind check-world time. Unless + * VALGRIND_EXPENSIVE, check 1 byte after each actual character. (If we + * found a character, not a terminator, the next byte must be a terminator + * or the start of the next character.) If the caller iterates the whole + * string, the last call will diagnose a missing terminator. + */ + if (mbstr[0] != '\0') + { +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr)); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1); +#endif + } + + return length; +} + +/* + * Returns the byte length of a multibyte character sequence bounded by a range + * [mbstr, end) of at least one byte in size. Raises an illegal byte sequence + * error if the sequence would exceed the range. + */ +int +pg_mblen_range(const char *mbstr, const char *end) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + Assert(end > mbstr); +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); +#endif + + if (unlikely(mbstr + length > end)) + report_invalid_encoding_db(mbstr, length, end - mbstr); + + return length; +} + +/* + * Returns the byte length of a multibyte character sequence bounded by a range + * extending for 'limit' bytes, which must be at least one. Raises an illegal + * byte sequence error if the sequence would exceed the range. 
+ */ +int +pg_mblen_with_len(const char *mbstr, int limit) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + Assert(limit >= 1); +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); +#endif + + if (unlikely(length > limit)) + report_invalid_encoding_db(mbstr, length, limit); + + return length; +} + + +/* + * Returns the length of a multibyte character sequence, without any + * validation of bounds. + * + * PLEASE NOTE: This function can only be used safely if the caller has + * already verified the input string, since otherwise there is a risk of + * overrunning the buffer if the string is invalid. A prior call to a + * pg_mbstrlen* function suffices. + */ +int +pg_mblen_unbounded(const char *mbstr) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); + + return length; +} + +/* + * Historical name for pg_mblen_unbounded(). Should not be used and will be + * removed in a later version. 
+ */ int pg_mblen(const char *mbstr) { - return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + return pg_mblen_unbounded(mbstr); } /* returns the display length of a multibyte character */ @@ -1175,14 +1298,14 @@ pg_mbstrlen(const char *mbstr) while (*mbstr) { - mbstr += pg_mblen(mbstr); + mbstr += pg_mblen_cstr(mbstr); len++; } return len; } /* returns the length (counted in wchars) of a multibyte string - * (not necessarily NULL terminated) + * (stops at the first of "limit" or a NUL) */ int pg_mbstrlen_with_len(const char *mbstr, int limit) @@ -1195,7 +1318,7 @@ pg_mbstrlen_with_len(const char *mbstr, int limit) while (limit > 0 && *mbstr) { - int l = pg_mblen(mbstr); + int l = pg_mblen_with_len(mbstr, limit); limit -= l; mbstr += l; @@ -1265,7 +1388,7 @@ pg_mbcharcliplen(const char *mbstr, int len, int limit) while (len > 0 && *mbstr) { - l = pg_mblen(mbstr); + l = pg_mblen_with_len(mbstr, len); nch++; if (nch > limit) break; @@ -1835,12 +1958,19 @@ void report_invalid_encoding(int encoding, const char *mbstr, int len) { int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len); + + report_invalid_encoding_int(encoding, mbstr, l, len); +} + +static void +report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len) +{ char buf[8 * 5 + 1]; char *p = buf; int j, jlimit; - jlimit = Min(l, len); + jlimit = Min(mblen, len); jlimit = Min(jlimit, 8); /* prevent buffer overrun */ for (j = 0; j < jlimit; j++) @@ -1857,6 +1987,12 @@ report_invalid_encoding(int encoding, const char *mbstr, int len) buf))); } +static void +report_invalid_encoding_db(const char *mbstr, int mblen, int len) +{ + report_invalid_encoding_int(GetDatabaseEncoding(), mbstr, mblen, len); +} + /* * report_untranslatable_char: complain about untranslatable character * diff --git a/src/common/wchar.c b/src/common/wchar.c index c0fb19b3f1a..5133e5e5b25 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -82,6 +82,9 @@ * subset to the 
ASCII routines to ensure consistency. */ +/* No error-reporting facility. Ignore incomplete trailing byte sequence. */ +#define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break + /* * SQL/ASCII */ @@ -127,22 +130,24 @@ pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) while (len > 0 && *from) { - if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte - * KANA") */ + if (*from == SS2) /* JIS X 0201 (so called "1 byte KANA") */ { + MB2CHAR_NEED_AT_LEAST(len, 2); from++; *to = (SS2 << 8) | *from++; len -= 2; } - else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */ + else if (*from == SS3) /* JIS X 0212 KANJI */ { + MB2CHAR_NEED_AT_LEAST(len, 3); from++; *to = (SS3 << 16) | (*from++ << 8); *to |= *from++; len -= 3; } - else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */ + else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */ { + MB2CHAR_NEED_AT_LEAST(len, 2); *to = *from++ << 8; *to |= *from++; len -= 2; @@ -254,22 +259,25 @@ pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) while (len > 0 && *from) { - if (*from == SS2 && len >= 3) /* code set 2 (unused?) */ + if (*from == SS2) /* code set 2 (unused?) */ { + MB2CHAR_NEED_AT_LEAST(len, 3); from++; *to = (SS2 << 16) | (*from++ << 8); *to |= *from++; len -= 3; } - else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */ + else if (*from == SS3) /* code set 3 (unused ?) */ { + MB2CHAR_NEED_AT_LEAST(len, 3); from++; *to = (SS3 << 16) | (*from++ << 8); *to |= *from++; len -= 3; } - else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */ + else if (IS_HIGHBIT_SET(*from)) /* code set 1 */ { + MB2CHAR_NEED_AT_LEAST(len, 2); *to = *from++ << 8; *to |= *from++; len -= 2; @@ -286,12 +294,22 @@ pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) return cnt; } +/* + * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for + * EUC_CN), but mb2wchar_with_len does. 
Tell a coherent story for code that + * relies on agreement between mb2wchar_with_len and mblen. Invalid text + * datums (e.g. from shared catalogs) reach this. + */ static int pg_euccn_mblen(const unsigned char *s) { int len; - if (IS_HIGHBIT_SET(*s)) + if (*s == SS2) + len = 3; + else if (*s == SS3) + len = 3; + else if (IS_HIGHBIT_SET(*s)) len = 2; else len = 1; @@ -321,23 +339,26 @@ pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) while (len > 0 && *from) { - if (*from == SS2 && len >= 4) /* code set 2 */ + if (*from == SS2) /* code set 2 */ { + MB2CHAR_NEED_AT_LEAST(len, 4); from++; *to = (((uint32) SS2) << 24) | (*from++ << 16); *to |= *from++ << 8; *to |= *from++; len -= 4; } - else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */ + else if (*from == SS3) /* code set 3 (unused?) */ { + MB2CHAR_NEED_AT_LEAST(len, 3); from++; *to = (SS3 << 16) | (*from++ << 8); *to |= *from++; len -= 3; } - else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */ + else if (IS_HIGHBIT_SET(*from)) /* code set 2 */ { + MB2CHAR_NEED_AT_LEAST(len, 2); *to = *from++ << 8; *to |= *from++; len -= 2; @@ -474,8 +495,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) } else if ((*from & 0xe0) == 0xc0) { - if (len < 2) - break; /* drop trailing incomplete char */ + MB2CHAR_NEED_AT_LEAST(len, 2); c1 = *from++ & 0x1f; c2 = *from++ & 0x3f; *to = (c1 << 6) | c2; @@ -483,8 +503,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) } else if ((*from & 0xf0) == 0xe0) { - if (len < 3) - break; /* drop trailing incomplete char */ + MB2CHAR_NEED_AT_LEAST(len, 3); c1 = *from++ & 0x0f; c2 = *from++ & 0x3f; c3 = *from++ & 0x3f; @@ -493,8 +512,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) } else if ((*from & 0xf8) == 0xf0) { - if (len < 4) - break; /* drop trailing incomplete char */ + MB2CHAR_NEED_AT_LEAST(len, 4); c1 = *from++ & 0x07; c2 = *from++ & 0x3f; c3 = *from++ & 0x3f; @@ 
-757,28 +775,32 @@ pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) while (len > 0 && *from) { - if (IS_LC1(*from) && len >= 2) + if (IS_LC1(*from)) { + MB2CHAR_NEED_AT_LEAST(len, 2); *to = *from++ << 16; *to |= *from++; len -= 2; } - else if (IS_LCPRV1(*from) && len >= 3) + else if (IS_LCPRV1(*from)) { + MB2CHAR_NEED_AT_LEAST(len, 3); from++; *to = *from++ << 16; *to |= *from++; len -= 3; } - else if (IS_LC2(*from) && len >= 3) + else if (IS_LC2(*from)) { + MB2CHAR_NEED_AT_LEAST(len, 3); *to = *from++ << 16; *to |= *from++ << 8; *to |= *from++; len -= 3; } - else if (IS_LCPRV2(*from) && len >= 4) + else if (IS_LCPRV2(*from)) { + MB2CHAR_NEED_AT_LEAST(len, 4); from++; *to = *from++ << 16; *to |= *from++ << 8; @@ -2145,7 +2167,7 @@ pg_encoding_set_invalid(int encoding, char *dst) const pg_wchar_tbl pg_wchar_table[] = { {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1}, /* PG_SQL_ASCII */ {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, /* PG_EUC_JP */ - {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2}, /* PG_EUC_CN */ + {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3}, /* PG_EUC_CN */ {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3}, /* PG_EUC_KR */ {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4}, /* PG_EUC_TW */ {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, /* PG_EUC_JIS_2004 */ diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 0ea7b3cda81..6a7ae2abea9 100644 --- 
a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -245,6 +245,8 @@ extern long changeDependenciesOn(Oid refClassId, Oid oldRefObjectId, extern Oid getExtensionOfObject(Oid classId, Oid objectId); extern List *getAutoExtensionsOfObject(Oid classId, Oid objectId); +extern Oid getExtensionType(Oid extensionOid, const char *typname); + extern bool sequenceIsOwned(Oid seqId, char deptype, Oid *tableId, int32 *colId); extern List *getOwnedSequences(Oid relid); extern Oid getIdentitySequence(Oid relid, AttrNumber attnum, bool missing_ok); diff --git a/src/include/commands/extension.h b/src/include/commands/extension.h index 042ae6ba70d..f2e45cf59ea 100644 --- a/src/include/commands/extension.h +++ b/src/include/commands/extension.h @@ -50,6 +50,8 @@ extern char *get_extension_name(Oid ext_oid); extern Oid get_extension_schema(Oid ext_oid); extern bool extension_file_exists(const char *extensionName); +extern Oid get_function_sibling_type(Oid funcoid, const char *typname); + extern ObjectAddress AlterExtensionNamespace(const char *extensionName, const char *newschema, Oid *oldschema); diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index c2cc2ad0963..cd9027a444a 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -608,7 +608,14 @@ extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2); extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n); extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n); extern size_t pg_wchar_strlen(const pg_wchar *str); +extern int pg_mblen_cstr(const char *mbstr); +extern int pg_mblen_range(const char *mbstr, const char *end); +extern int pg_mblen_with_len(const char *mbstr, int limit); +extern int pg_mblen_unbounded(const char *mbstr); + +/* deprecated */ extern int pg_mblen(const char *mbstr); + extern int pg_dsplen(const char *mbstr); extern int pg_mbstrlen(const char *mbstr); extern int 
pg_mbstrlen_with_len(const char *mbstr, int limit); diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h index 58d594d4006..787ffb165d1 100644 --- a/src/include/tsearch/ts_locale.h +++ b/src/include/tsearch/ts_locale.h @@ -37,13 +37,37 @@ typedef struct /* The second argument of t_iseq() must be a plain ASCII character */ #define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c)) -#define COPYCHAR(d,s) memcpy(d, s, pg_mblen(s)) +/* Copy multibyte character of known byte length, return byte length. */ +static inline int +ts_copychar_with_len(void *dest, const void *src, int length) +{ + memcpy(dest, src, length); + return length; +} + +/* Copy multibyte character from null-terminated string, return byte length. */ +static inline int +ts_copychar_cstr(void *dest, const void *src) +{ + return ts_copychar_with_len(dest, src, pg_mblen_cstr((const char *) src)); +} + +/* Historical macro for the above. */ +#define COPYCHAR ts_copychar_cstr + +#define GENERATE_T_ISCLASS_DECL(character_class) \ +extern int t_is##character_class##_with_len(const char *ptr, int len); \ +extern int t_is##character_class##_cstr(const char *ptr); \ +extern int t_is##character_class##_unbounded(const char *ptr); \ +\ +/* deprecated */ \ +extern int t_is##character_class(const char *ptr); -extern int t_isdigit(const char *ptr); -extern int t_isspace(const char *ptr); -extern int t_isalpha(const char *ptr); -extern int t_isalnum(const char *ptr); -extern int t_isprint(const char *ptr); +GENERATE_T_ISCLASS_DECL(alnum); +GENERATE_T_ISCLASS_DECL(alpha); +GENERATE_T_ISCLASS_DECL(digit); +GENERATE_T_ISCLASS_DECL(print); +GENERATE_T_ISCLASS_DECL(space); extern char *lowerstr(const char *str); extern char *lowerstr_with_len(const char *str, int len); diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h index d3dc8bae475..48db1b800a1 100644 --- a/src/include/tsearch/ts_utils.h +++ b/src/include/tsearch/ts_utils.h @@ -40,14 +40,12 @@ extern bool 
gettoken_tsvector(TSVectorParseState state, extern void close_tsvector_parser(TSVectorParseState state); /* phrase operator begins with '<' */ -#define ISOPERATOR(x) \ - ( pg_mblen(x) == 1 && ( *(x) == '!' || \ - *(x) == '&' || \ - *(x) == '|' || \ - *(x) == '(' || \ - *(x) == ')' || \ - *(x) == '<' \ - ) ) +#define ISOPERATOR(x) (*(x) == '!' || \ + *(x) == '&' || \ + *(x) == '|' || \ + *(x) == '(' || \ + *(x) == ')' || \ + *(x) == '<') /* parse_tsquery */ diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index cb309ede6ae..bcba170c327 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -70,6 +70,7 @@ extern int64 get_size_from_segDBs(const char *cmd); /* oid.c */ extern oidvector *buildoidvector(const Oid *oids, int n); +extern void check_valid_oidvector(const oidvector *oidArray); extern Oid oidparse(Node *node); /* pseudotypes.c */ diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index b0658a94bd9..e790dfe2af5 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -132,9 +132,12 @@ enum SysCacheIdentifier MVTABLESMVRELOID, USERMAPPINGOID, USERMAPPINGUSERSERVER, - ATTENCODINGNUM + ATTENCODINGNUM, -#define SysCacheSize (ATTENCODINGNUM + 1) + /* intentionally out of alphabetical order, to avoid an ABI break: */ + EXTENSIONOID + +#define SysCacheSize (EXTENSIONOID + 1) }; extern void InitCatalogCache(void); diff --git a/src/test/modules/test_regex/test_regex.c b/src/test/modules/test_regex/test_regex.c index d1dd48a993b..3a470dbae44 100644 --- a/src/test/modules/test_regex/test_regex.c +++ b/src/test/modules/test_regex/test_regex.c @@ -414,7 +414,8 @@ parse_test_flags(test_re_flags *flags, text *opts) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression test option: \"%.*s\"", - pg_mblen(opt_p + i), opt_p + i))); + pg_mblen_range(opt_p + i, opt_p + opt_len), + opt_p + i))); break; } } diff --git 
a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index 20b401a9ef9..bf69c337ec2 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -1601,6 +1601,11 @@ select '[0:1]={1.1,2.2}'::float8[]; (1 row) -- all of the above should be accepted +-- some day we might allow these cases, but for now they're errors: +select array[]::oidvector; +ERROR: array is not a valid oidvector +select array[]::int2vector; +ERROR: array is not a valid int2vector -- tests for array aggregates CREATE TEMP TABLE arraggtest ( f1 INT[], f2 TEXT[][], f3 FLOAT[]) DISTRIBUTED RANDOMLY; INSERT INTO arraggtest (f1, f2, f3) VALUES diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out new file mode 100644 index 00000000000..ea1f38cff41 --- /dev/null +++ b/src/test/regress/expected/encoding.out @@ -0,0 +1,401 @@ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +\getenv libdir PG_LIBDIR +\getenv dlsuffix PG_DLSUFFIX +\set regresslib :libdir '/regress' :dlsuffix +CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[] + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean + AS :'regresslib' LANGUAGE C STRICT; +CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text); +INSERT INTO regress_encoding +VALUES ('café', + 'caf' || test_bytea_to_text('\xc3'), + 'café' || test_bytea_to_text('\x00') || 'dcba', + 'caf' || 
test_bytea_to_text('\xc300') || 'dcba'); +SELECT good, truncated, with_nul FROM regress_encoding; + good | truncated | with_nul +------+-----------+---------- + café | caf | café +(1 row) + +SELECT length(good) FROM regress_encoding; + length +-------- + 4 +(1 row) + +SELECT substring(good, 3, 1) FROM regress_encoding; + substring +----------- + f +(1 row) + +SELECT substring(good, 4, 1) FROM regress_encoding; + substring +----------- + é +(1 row) + +SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding; + regexp_replace +---------------- + é +(1 row) + +SELECT reverse(good) FROM regress_encoding; + reverse +--------- + éfac +(1 row) + +-- invalid short mb character = error +SELECT length(truncated) FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +SELECT substring(truncated, 1, 1) FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +SELECT reverse(truncated) FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +-- invalid short mb character = silently dropped +SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; + regexp_replace +---------------- + caf +(1 row) + +-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string +-- contains NUL at a character boundary position, some functions treat it as a +-- character while others treat it as a terminator, as implementation details. 
+-- NUL = terminator +SELECT length(with_nul) FROM regress_encoding; + length +-------- + 4 +(1 row) + +SELECT substring(with_nul, 3, 1) FROM regress_encoding; + substring +----------- + f +(1 row) + +SELECT substring(with_nul, 4, 1) FROM regress_encoding; + substring +----------- + é +(1 row) + +SELECT substring(with_nul, 5, 1) FROM regress_encoding; + substring +----------- + +(1 row) + +SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; + convert_to +------------ + \x +(1 row) + +SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; + regexp_replace +---------------- + é +(1 row) + +-- NUL = character +SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; + with_nul | reverse | reverse +----------+---------+--------- + café | abcd | café +(1 row) + +-- If a corrupted string contains NUL in the tail bytes of a multibyte +-- character (invalid in all encodings), it is considered part of the +-- character for length purposes. An error will only be raised in code paths +-- that convert or verify encodings. +SELECT length(truncated_with_nul) FROM regress_encoding; + length +-------- + 8 +(1 row) + +SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding; + substring +----------- + f +(1 row) + +SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding; + substring +----------- + +(1 row) + +SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 0x00 +SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding; + substring +----------- + d +(1 row) + +SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding; + ?column? +---------- + t +(1 row) + +SELECT reverse(truncated_with_nul) FROM regress_encoding; + reverse +--------- + abcd +(1 row) + +-- unbounded: sequence would overrun the string! 
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3) +FROM regress_encoding; + test_mblen_func +----------------- + 2 +(1 row) + +-- condition detected when using the length/range variants +SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3) +FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3) +FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +-- unbounded: sequence would overrun the string, if the terminator were really +-- the end of it +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; + test_mblen_func +----------------- + 2 +(1 row) + +SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3) +FROM regress_encoding; + test_mblen_func +----------------- + 2 +(1 row) + +-- condition detected when using the cstr variants +SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +DROP TABLE regress_encoding; +-- mb<->wchar conversions +CREATE FUNCTION test_encoding(encoding text, description text, input bytea) +RETURNS VOID LANGUAGE plpgsql AS +$$ +DECLARE + prefix text; + len int; + wchars int[]; + round_trip bytea; + result text; +BEGIN + prefix := rpad(encoding || ' ' || description || ':', 28); + + -- XXX could also test validation, length functions and include client + -- only encodings with these test cases + + IF test_valid_server_encoding(encoding) THEN + wchars := test_text_to_wchars(encoding, test_bytea_to_text(input)); + round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars)); + if input = round_trip then + result := 'OK'; + elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then + result := 'truncated'; + else + result := 'failed'; + end if; + RAISE NOTICE '% % -> % -> % = %', prefix, 
input, wchars, round_trip, result; + END IF; +END; +$$; +-- No validation is done on the encoding itself, just the length to avoid +-- overruns, so some of the byte sequences below are bogus. They cover +-- all code branches, server encodings only for now. +CREATE TABLE encoding_tests (encoding text, description text, input bytea); +INSERT INTO encoding_tests VALUES + -- LATIN1, other single-byte encodings + ('LATIN1', 'ASCII', 'a'), + ('LATIN1', 'extended', '\xe9'), + -- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion): + -- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length) + -- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length) + -- 2 80..ff (CS1) + ('EUC_JP', 'ASCII', 'a'), + ('EUC_JP', 'CS1, short', '\x80'), + ('EUC_JP', 'CS1', '\x8002'), + ('EUC_JP', 'CS2, short', '\x8e'), + ('EUC_JP', 'CS2', '\x8e02'), + ('EUC_JP', 'CS3, short', '\x8f'), + ('EUC_JP', 'CS3, short', '\x8f02'), + ('EUC_JP', 'CS3', '\x8f0203'), + -- EUC_CN + -- 3 8e (CS2, not used but arbitrarily considered to have length 3) + -- 3 8f (CS3, not used but arbitrarily considered to have length 3) + -- 2 80..ff (CS1) + ('EUC_CN', 'ASCII', 'a'), + ('EUC_CN', 'CS1, short', '\x80'), + ('EUC_CN', 'CS1', '\x8002'), + ('EUC_CN', 'CS2, short', '\x8e'), + ('EUC_CN', 'CS2, short', '\x8e02'), + ('EUC_CN', 'CS2', '\x8e0203'), + ('EUC_CN', 'CS3, short', '\x8f'), + ('EUC_CN', 'CS3, short', '\x8f02'), + ('EUC_CN', 'CS3', '\x8f0203'), + -- EUC_TW: + -- 4 8e (CS2) + -- 3 8f (CS3, not used but arbitrarily considered to have length 3) + -- 2 80..ff (CS1) + ('EUC_TW', 'ASCII', 'a'), + ('EUC_TW', 'CS1, short', '\x80'), + ('EUC_TW', 'CS1', '\x8002'), + ('EUC_TW', 'CS2, short', '\x8e'), + ('EUC_TW', 'CS2, short', '\x8e02'), + ('EUC_TW', 'CS2, short', '\x8e0203'), + ('EUC_TW', 'CS2', '\x8e020304'), + ('EUC_TW', 'CS3, short', '\x8f'), + ('EUC_TW', 'CS3, short', '\x8f02'), + ('EUC_TW', 'CS3', '\x8f0203'), + -- UTF8 + -- 2 c0..df + -- 3 e0..ef + -- 
4 f0..f7 (but maximum real codepoint U+10ffff has f4) + -- 5 f8..fb (not supported) + -- 6 fc..fd (not supported) + ('UTF8', 'ASCII', 'a'), + ('UTF8', '2 byte, short', '\xdf'), + ('UTF8', '2 byte', '\xdf82'), + ('UTF8', '3 byte, short', '\xef'), + ('UTF8', '3 byte, short', '\xef82'), + ('UTF8', '3 byte', '\xef8283'), + ('UTF8', '4 byte, short', '\xf7'), + ('UTF8', '4 byte, short', '\xf782'), + ('UTF8', '4 byte, short', '\xf78283'), + ('UTF8', '4 byte', '\xf7828384'), + ('UTF8', '5 byte, unsupported', '\xfb'), + ('UTF8', '5 byte, unsupported', '\xfb82'), + ('UTF8', '5 byte, unsupported', '\xfb8283'), + ('UTF8', '5 byte, unsupported', '\xfb828384'), + ('UTF8', '5 byte, unsupported', '\xfb82838485'), + ('UTF8', '6 byte, unsupported', '\xfd'), + ('UTF8', '6 byte, unsupported', '\xfd82'), + ('UTF8', '6 byte, unsupported', '\xfd8283'), + ('UTF8', '6 byte, unsupported', '\xfd828384'), + ('UTF8', '6 byte, unsupported', '\xfd82838485'), + ('UTF8', '6 byte, unsupported', '\xfd8283848586'), + -- MULE_INTERNAL + -- 2 81..8d LC1 + -- 3 90..99 LC2 + ('MULE_INTERNAL', 'ASCII', 'a'), + ('MULE_INTERNAL', 'LC1, short', '\x81'), + ('MULE_INTERNAL', 'LC1', '\x8182'), + ('MULE_INTERNAL', 'LC2, short', '\x90'), + ('MULE_INTERNAL', 'LC2, short', '\x9082'), + ('MULE_INTERNAL', 'LC2', '\x908283'); +SELECT COUNT(test_encoding(encoding, description, input)) > 0 +FROM encoding_tests; +NOTICE: LATIN1 ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: LATIN1 extended: \xe9 -> {233} -> \xe9 = OK +NOTICE: EUC_JP ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: EUC_JP CS1, short: \x80 -> {} -> \x = truncated +NOTICE: EUC_JP CS1: \x8002 -> {32770} -> \x8002 = OK +NOTICE: EUC_JP CS2, short: \x8e -> {} -> \x = truncated +NOTICE: EUC_JP CS2: \x8e02 -> {36354} -> \x8e02 = OK +NOTICE: EUC_JP CS3, short: \x8f -> {} -> \x = truncated +NOTICE: EUC_JP CS3, short: \x8f02 -> {} -> \x = truncated +NOTICE: EUC_JP CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK +NOTICE: EUC_CN ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: EUC_CN CS1, 
short: \x80 -> {} -> \x = truncated +NOTICE: EUC_CN CS1: \x8002 -> {32770} -> \x8002 = OK +NOTICE: EUC_CN CS2, short: \x8e -> {} -> \x = truncated +NOTICE: EUC_CN CS2, short: \x8e02 -> {} -> \x = truncated +NOTICE: EUC_CN CS2: \x8e0203 -> {9306627} -> \x8e0203 = OK +NOTICE: EUC_CN CS3, short: \x8f -> {} -> \x = truncated +NOTICE: EUC_CN CS3, short: \x8f02 -> {} -> \x = truncated +NOTICE: EUC_CN CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK +NOTICE: EUC_TW ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: EUC_TW CS1, short: \x80 -> {} -> \x = truncated +NOTICE: EUC_TW CS1: \x8002 -> {32770} -> \x8002 = OK +NOTICE: EUC_TW CS2, short: \x8e -> {} -> \x = truncated +NOTICE: EUC_TW CS2, short: \x8e02 -> {} -> \x = truncated +NOTICE: EUC_TW CS2, short: \x8e0203 -> {} -> \x = truncated +NOTICE: EUC_TW CS2: \x8e020304 -> {-1912470780} -> \x8e020304 = OK +NOTICE: EUC_TW CS3, short: \x8f -> {} -> \x = truncated +NOTICE: EUC_TW CS3, short: \x8f02 -> {} -> \x = truncated +NOTICE: EUC_TW CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK +NOTICE: UTF8 ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: UTF8 2 byte, short: \xdf -> {} -> \x = truncated +NOTICE: UTF8 2 byte: \xdf82 -> {1986} -> \xdf82 = OK +NOTICE: UTF8 3 byte, short: \xef -> {} -> \x = truncated +NOTICE: UTF8 3 byte, short: \xef82 -> {} -> \x = truncated +NOTICE: UTF8 3 byte: \xef8283 -> {61571} -> \xef8283 = OK +NOTICE: UTF8 4 byte, short: \xf7 -> {} -> \x = truncated +NOTICE: UTF8 4 byte, short: \xf782 -> {} -> \x = truncated +NOTICE: UTF8 4 byte, short: \xf78283 -> {} -> \x = truncated +NOTICE: UTF8 4 byte: \xf7828384 -> {1843396} -> \xf7828384 = OK +NOTICE: UTF8 5 byte, unsupported: \xfb -> {251} -> \xc3bb = failed +NOTICE: UTF8 5 byte, unsupported: \xfb82 -> {251,130} -> \xc3bbc282 = failed +NOTICE: UTF8 5 byte, unsupported: \xfb8283 -> {251,130,131} -> \xc3bbc282c283 = failed +NOTICE: UTF8 5 byte, unsupported: \xfb828384 -> {251,130,131,132} -> \xc3bbc282c283c284 = failed +NOTICE: UTF8 5 byte, unsupported: \xfb82838485 -> 
{251,130,131,132,133} -> \xc3bbc282c283c284c285 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd -> {253} -> \xc3bd = failed +NOTICE: UTF8 6 byte, unsupported: \xfd82 -> {253,130} -> \xc3bdc282 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd8283 -> {253,130,131} -> \xc3bdc282c283 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd828384 -> {253,130,131,132} -> \xc3bdc282c283c284 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd82838485 -> {253,130,131,132,133} -> \xc3bdc282c283c284c285 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd8283848586 -> {253,130,131,132,133,134} -> \xc3bdc282c283c284c285c286 = failed +NOTICE: MULE_INTERNAL ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: MULE_INTERNAL LC1, short: \x81 -> {} -> \x = truncated +NOTICE: MULE_INTERNAL LC1: \x8182 -> {8454274} -> \x8182 = OK +NOTICE: MULE_INTERNAL LC2, short: \x90 -> {} -> \x = truncated +NOTICE: MULE_INTERNAL LC2, short: \x9082 -> {} -> \x = truncated +NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK + ?column? +---------- + t +(1 row) + +DROP TABLE encoding_tests; +DROP FUNCTION test_encoding; +DROP FUNCTION test_text_to_wchars; +DROP FUNCTION test_mblen_func; +DROP FUNCTION test_bytea_to_text; +DROP FUNCTION test_text_to_bytea; +-- substring slow path: multi-byte escape char vs. multi-byte pattern char. +SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); + substring +----------- + +(1 row) + +-- Levenshtein distance metric: exercise character length cache. +SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); +ERROR: column "real§_name" does not exist +LINE 1: SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); + ^ +HINT: Perhaps you meant to reference the column "x.real_name". +-- JSON errcontext: truncate long data. +SELECT repeat(U&'\00A7', 30)::json; +ERROR: invalid input syntax for type json +DETAIL: Token "§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§" is invalid. 
+CONTEXT: JSON data, line 1: ...§§§§§§§§§§§§§§§§§§§§§§§§ diff --git a/src/test/regress/expected/encoding_1.out b/src/test/regress/expected/encoding_1.out new file mode 100644 index 00000000000..a5b02090901 --- /dev/null +++ b/src/test/regress/expected/encoding_1.out @@ -0,0 +1,4 @@ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/expected/euc_kr.out b/src/test/regress/expected/euc_kr.out new file mode 100644 index 00000000000..7a61c89a43a --- /dev/null +++ b/src/test/regress/expected/euc_kr.out @@ -0,0 +1,16 @@ +-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent +-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all +-- of EUC_KR, also run the test in UTF8. +SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset +\if :skip_test +\quit +\endif +-- Exercise is_multibyte_char_in_char (non-UTF8) slow path. +SELECT POSITION( + convert_from('\xbcf6c7d0', 'EUC_KR') IN + convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR')); + position +---------- + 5 +(1 row) + diff --git a/src/test/regress/expected/euc_kr_1.out b/src/test/regress/expected/euc_kr_1.out new file mode 100644 index 00000000000..faaac5d6355 --- /dev/null +++ b/src/test/regress/expected/euc_kr_1.out @@ -0,0 +1,6 @@ +-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent +-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all +-- of EUC_KR, also run the test in UTF8. 
+SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index bc5918a22b8..c4b5a58713c 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -32,7 +32,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t # geometry depends on point, lseg, line, box, path, polygon, circle # horology depends on date, time, timetz, timestamp, timestamptz, interval # ---------- -test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database +test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database encoding euc_kr # ---------- # Load huge amounts of data diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 0fc787c1aaf..cd4d1df4ef0 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -1285,6 +1285,145 @@ test_enc_conversion(PG_FUNCTION_ARGS) PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); } +/* Convert bytea to text without validation for corruption tests from SQL. */ +PG_FUNCTION_INFO_V1(test_bytea_to_text); +Datum +test_bytea_to_text(PG_FUNCTION_ARGS) +{ + PG_RETURN_TEXT_P(PG_GETARG_BYTEA_PP(0)); +} + +/* And the reverse. */ +PG_FUNCTION_INFO_V1(test_text_to_bytea); +Datum +test_text_to_bytea(PG_FUNCTION_ARGS) +{ + PG_RETURN_BYTEA_P(PG_GETARG_TEXT_PP(0)); +} + +/* Corruption tests in C. 
*/ +PG_FUNCTION_INFO_V1(test_mblen_func); +Datum +test_mblen_func(PG_FUNCTION_ARGS) +{ + const char *func = text_to_cstring(PG_GETARG_BYTEA_PP(0)); + const char *encoding = text_to_cstring(PG_GETARG_BYTEA_PP(1)); + text *string = PG_GETARG_BYTEA_PP(2); + int offset = PG_GETARG_INT32(3); + const char *data = VARDATA_ANY(string); + size_t size = VARSIZE_ANY_EXHDR(string); + int result = 0; + + if (strcmp(func, "pg_mblen_unbounded") == 0) + result = pg_mblen_unbounded(data + offset); + else if (strcmp(func, "pg_mblen_cstr") == 0) + result = pg_mblen_cstr(data + offset); + else if (strcmp(func, "pg_mblen_with_len") == 0) + result = pg_mblen_with_len(data + offset, size - offset); + else if (strcmp(func, "pg_mblen_range") == 0) + result = pg_mblen_range(data + offset, data + size); + else if (strcmp(func, "pg_encoding_mblen") == 0) + result = pg_encoding_mblen(pg_char_to_encoding(encoding), data + offset); + else + elog(ERROR, "unknown function"); + + PG_RETURN_INT32(result); +} + +PG_FUNCTION_INFO_V1(test_text_to_wchars); +Datum +test_text_to_wchars(PG_FUNCTION_ARGS) +{ + const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0)); + text *string = PG_GETARG_TEXT_PP(1); + const char *data = VARDATA_ANY(string); + size_t size = VARSIZE_ANY_EXHDR(string); + pg_wchar *wchars = palloc(sizeof(pg_wchar) * (size + 1)); + Datum *datums; + int wlen; + int encoding; + + encoding = pg_char_to_encoding(encoding_name); + if (encoding < 0) + elog(ERROR, "unknown encoding name: %s", encoding_name); + + if (size > 0) + { + datums = palloc(sizeof(Datum) * size); + wlen = pg_encoding_mb2wchar_with_len(encoding, + data, + wchars, + size); + Assert(wlen >= 0); + Assert(wlen <= size); + Assert(wchars[wlen] == 0); + + for (int i = 0; i < wlen; ++i) + datums[i] = UInt32GetDatum(wchars[i]); + } + else + { + datums = NULL; + wlen = 0; + } + + PG_RETURN_ARRAYTYPE_P(construct_array_builtin(datums, wlen, INT4OID)); +} + +PG_FUNCTION_INFO_V1(test_wchars_to_text); +Datum 
+test_wchars_to_text(PG_FUNCTION_ARGS) +{ + const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0)); + ArrayType *array = PG_GETARG_ARRAYTYPE_P(1); + Datum *datums; + bool *nulls; + char *mb; + text *result; + int wlen; + int bytes; + int encoding; + + encoding = pg_char_to_encoding(encoding_name); + if (encoding < 0) + elog(ERROR, "unknown encoding name: %s", encoding_name); + + deconstruct_array_builtin(array, INT4OID, &datums, &nulls, &wlen); + + if (wlen > 0) + { + pg_wchar *wchars = palloc(sizeof(pg_wchar) * wlen); + + for (int i = 0; i < wlen; ++i) + { + if (nulls[i]) + elog(ERROR, "unexpected NULL in array"); + wchars[i] = DatumGetInt32(datums[i]); + } + + mb = palloc(pg_encoding_max_length(encoding) * wlen + 1); + bytes = pg_encoding_wchar2mb_with_len(encoding, wchars, mb, wlen); + } + else + { + mb = ""; + bytes = 0; + } + + result = palloc(bytes + VARHDRSZ); + SET_VARSIZE(result, bytes + VARHDRSZ); + memcpy(VARDATA(result), mb, bytes); + + PG_RETURN_TEXT_P(result); +} + +PG_FUNCTION_INFO_V1(test_valid_server_encoding); +Datum +test_valid_server_encoding(PG_FUNCTION_ARGS) +{ + return pg_valid_server_encoding(text_to_cstring(PG_GETARG_TEXT_PP(0))); +} + /* Provide SQL access to IsBinaryCoercible() */ PG_FUNCTION_INFO_V1(binary_coercible); Datum diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index aabbd6f92de..075e6949846 100644 --- a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -502,6 +502,10 @@ select array[]::text[]; select '[0:1]={1.1,2.2}'::float8[]; -- all of the above should be accepted +-- some day we might allow these cases, but for now they're errors: +select array[]::oidvector; +select array[]::int2vector; + -- tests for array aggregates CREATE TEMP TABLE arraggtest ( f1 INT[], f2 TEXT[][], f3 FLOAT[]) DISTRIBUTED RANDOMLY; diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql new file mode 100644 index 00000000000..b9543c0cb32 --- /dev/null +++ 
b/src/test/regress/sql/encoding.sql @@ -0,0 +1,228 @@ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +\getenv libdir PG_LIBDIR +\getenv dlsuffix PG_DLSUFFIX + +\set regresslib :libdir '/regress' :dlsuffix + +CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[] + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean + AS :'regresslib' LANGUAGE C STRICT; + + +CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text); +INSERT INTO regress_encoding +VALUES ('café', + 'caf' || test_bytea_to_text('\xc3'), + 'café' || test_bytea_to_text('\x00') || 'dcba', + 'caf' || test_bytea_to_text('\xc300') || 'dcba'); + +SELECT good, truncated, with_nul FROM regress_encoding; + +SELECT length(good) FROM regress_encoding; +SELECT substring(good, 3, 1) FROM regress_encoding; +SELECT substring(good, 4, 1) FROM regress_encoding; +SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding; +SELECT reverse(good) FROM regress_encoding; + +-- invalid short mb character = error +SELECT length(truncated) FROM regress_encoding; +SELECT substring(truncated, 1, 1) FROM regress_encoding; +SELECT reverse(truncated) FROM regress_encoding; +-- invalid short mb character = silently dropped +SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; + +-- PostgreSQL doesn't allow strings to contain NUL. 
If a corrupted string +-- contains NUL at a character boundary position, some functions treat it as a +-- character while others treat it as a terminator, as implementation details. + +-- NUL = terminator +SELECT length(with_nul) FROM regress_encoding; +SELECT substring(with_nul, 3, 1) FROM regress_encoding; +SELECT substring(with_nul, 4, 1) FROM regress_encoding; +SELECT substring(with_nul, 5, 1) FROM regress_encoding; +SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; +SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; +-- NUL = character +SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; + +-- If a corrupted string contains NUL in the tail bytes of a multibyte +-- character (invalid in all encodings), it is considered part of the +-- character for length purposes. An error will only be raised in code paths +-- that convert or verify encodings. + +SELECT length(truncated_with_nul) FROM regress_encoding; +SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding; +SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding; +SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding; +SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding; +SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding; +SELECT reverse(truncated_with_nul) FROM regress_encoding; + +-- unbounded: sequence would overrun the string! 
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3) +FROM regress_encoding; + +-- condition detected when using the length/range variants +SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3) +FROM regress_encoding; +SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3) +FROM regress_encoding; + +-- unbounded: sequence would overrun the string, if the terminator were really +-- the end of it +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; +SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3) +FROM regress_encoding; + +-- condition detected when using the cstr variants +SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; + +DROP TABLE regress_encoding; + +-- mb<->wchar conversions +CREATE FUNCTION test_encoding(encoding text, description text, input bytea) +RETURNS VOID LANGUAGE plpgsql AS +$$ +DECLARE + prefix text; + len int; + wchars int[]; + round_trip bytea; + result text; +BEGIN + prefix := rpad(encoding || ' ' || description || ':', 28); + + -- XXX could also test validation, length functions and include client + -- only encodings with these test cases + + IF test_valid_server_encoding(encoding) THEN + wchars := test_text_to_wchars(encoding, test_bytea_to_text(input)); + round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars)); + if input = round_trip then + result := 'OK'; + elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then + result := 'truncated'; + else + result := 'failed'; + end if; + RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result; + END IF; +END; +$$; +-- No validation is done on the encoding itself, just the length to avoid +-- overruns, so some of the byte sequences below are bogus. They cover +-- all code branches, server encodings only for now. 
+CREATE TABLE encoding_tests (encoding text, description text, input bytea); +INSERT INTO encoding_tests VALUES + -- LATIN1, other single-byte encodings + ('LATIN1', 'ASCII', 'a'), + ('LATIN1', 'extended', '\xe9'), + -- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion): + -- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length) + -- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length) + -- 2 80..ff (CS1) + ('EUC_JP', 'ASCII', 'a'), + ('EUC_JP', 'CS1, short', '\x80'), + ('EUC_JP', 'CS1', '\x8002'), + ('EUC_JP', 'CS2, short', '\x8e'), + ('EUC_JP', 'CS2', '\x8e02'), + ('EUC_JP', 'CS3, short', '\x8f'), + ('EUC_JP', 'CS3, short', '\x8f02'), + ('EUC_JP', 'CS3', '\x8f0203'), + -- EUC_CN + -- 3 8e (CS2, not used but arbitrarily considered to have length 3) + -- 3 8f (CS3, not used but arbitrarily considered to have length 3) + -- 2 80..ff (CS1) + ('EUC_CN', 'ASCII', 'a'), + ('EUC_CN', 'CS1, short', '\x80'), + ('EUC_CN', 'CS1', '\x8002'), + ('EUC_CN', 'CS2, short', '\x8e'), + ('EUC_CN', 'CS2, short', '\x8e02'), + ('EUC_CN', 'CS2', '\x8e0203'), + ('EUC_CN', 'CS3, short', '\x8f'), + ('EUC_CN', 'CS3, short', '\x8f02'), + ('EUC_CN', 'CS3', '\x8f0203'), + -- EUC_TW: + -- 4 8e (CS2) + -- 3 8f (CS3, not used but arbitrarily considered to have length 3) + -- 2 80..ff (CS1) + ('EUC_TW', 'ASCII', 'a'), + ('EUC_TW', 'CS1, short', '\x80'), + ('EUC_TW', 'CS1', '\x8002'), + ('EUC_TW', 'CS2, short', '\x8e'), + ('EUC_TW', 'CS2, short', '\x8e02'), + ('EUC_TW', 'CS2, short', '\x8e0203'), + ('EUC_TW', 'CS2', '\x8e020304'), + ('EUC_TW', 'CS3, short', '\x8f'), + ('EUC_TW', 'CS3, short', '\x8f02'), + ('EUC_TW', 'CS3', '\x8f0203'), + -- UTF8 + -- 2 c0..df + -- 3 e0..ef + -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4) + -- 5 f8..fb (not supported) + -- 6 fc..fd (not supported) + ('UTF8', 'ASCII', 'a'), + ('UTF8', '2 byte, short', '\xdf'), + ('UTF8', '2 byte', '\xdf82'), + ('UTF8', '3 byte, short', '\xef'), + ('UTF8', 
'3 byte, short', '\xef82'), + ('UTF8', '3 byte', '\xef8283'), + ('UTF8', '4 byte, short', '\xf7'), + ('UTF8', '4 byte, short', '\xf782'), + ('UTF8', '4 byte, short', '\xf78283'), + ('UTF8', '4 byte', '\xf7828384'), + ('UTF8', '5 byte, unsupported', '\xfb'), + ('UTF8', '5 byte, unsupported', '\xfb82'), + ('UTF8', '5 byte, unsupported', '\xfb8283'), + ('UTF8', '5 byte, unsupported', '\xfb828384'), + ('UTF8', '5 byte, unsupported', '\xfb82838485'), + ('UTF8', '6 byte, unsupported', '\xfd'), + ('UTF8', '6 byte, unsupported', '\xfd82'), + ('UTF8', '6 byte, unsupported', '\xfd8283'), + ('UTF8', '6 byte, unsupported', '\xfd828384'), + ('UTF8', '6 byte, unsupported', '\xfd82838485'), + ('UTF8', '6 byte, unsupported', '\xfd8283848586'), + -- MULE_INTERNAL + -- 2 81..8d LC1 + -- 3 90..99 LC2 + ('MULE_INTERNAL', 'ASCII', 'a'), + ('MULE_INTERNAL', 'LC1, short', '\x81'), + ('MULE_INTERNAL', 'LC1', '\x8182'), + ('MULE_INTERNAL', 'LC2, short', '\x90'), + ('MULE_INTERNAL', 'LC2, short', '\x9082'), + ('MULE_INTERNAL', 'LC2', '\x908283'); + +SELECT COUNT(test_encoding(encoding, description, input)) > 0 +FROM encoding_tests; + +DROP TABLE encoding_tests; +DROP FUNCTION test_encoding; +DROP FUNCTION test_text_to_wchars; +DROP FUNCTION test_mblen_func; +DROP FUNCTION test_bytea_to_text; +DROP FUNCTION test_text_to_bytea; + + +-- substring slow path: multi-byte escape char vs. multi-byte pattern char. +SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); +-- Levenshtein distance metric: exercise character length cache. +SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); +-- JSON errcontext: truncate long data. +SELECT repeat(U&'\00A7', 30)::json; diff --git a/src/test/regress/sql/euc_kr.sql b/src/test/regress/sql/euc_kr.sql new file mode 100644 index 00000000000..1851b2a8c14 --- /dev/null +++ b/src/test/regress/sql/euc_kr.sql @@ -0,0 +1,12 @@ +-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent +-- non-UTF8, multibyte encoding as of 2026-01. 
Since UTF8 can represent all +-- of EUC_KR, also run the test in UTF8. +SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset +\if :skip_test +\quit +\endif + +-- Exercise is_multibyte_char_in_char (non-UTF8) slow path. +SELECT POSITION( + convert_from('\xbcf6c7d0', 'EUC_KR') IN + convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR')); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 97ae28337d3..0b1b1df3b4a 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -737,6 +737,7 @@ ExtensibleNodeEntry ExtensibleNodeMethods ExtensionControlFile ExtensionInfo +ExtensionSiblingCache ExtensionVersionInfo FDWCollateState FD_SET