From 1f205920234bfbdc2f02010cbe88b80845e6e664 Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 06:48:28 +0900 Subject: [PATCH 01/16] Add experimental opt-in libfyaml backend Built only with --enable-libfyaml; without the flag the default libyaml backend is unchanged. The parser and emitter are reimplemented against libfyaml's event API in separate translation units guarded by PSYCH_USE_LIBFYAML, and the backend is not supported on Windows. Co-Authored-By: Claude Opus 4.8 --- ext/psych/extconf.rb | 16 ++ ext/psych/psych.c | 28 +++ ext/psych/psych.h | 4 + ext/psych/psych_emitter.c | 4 + ext/psych/psych_emitter_fy.c | 429 +++++++++++++++++++++++++++++++ ext/psych/psych_parser.c | 4 + ext/psych/psych_parser_fy.c | 476 +++++++++++++++++++++++++++++++++++ 7 files changed, 961 insertions(+) create mode 100644 ext/psych/psych_emitter_fy.c create mode 100644 ext/psych/psych_parser_fy.c diff --git a/ext/psych/extconf.rb b/ext/psych/extconf.rb index 589e201c..e1ea7510 100644 --- a/ext/psych/extconf.rb +++ b/ext/psych/extconf.rb @@ -2,6 +2,22 @@ # frozen_string_literal: true require 'mkmf' +# Experimental, opt-in libfyaml backend. Only used when psych is built with +# --enable-libfyaml. Without the flag nothing below changes and the default +# libyaml backend is built exactly as before. 
+if enable_config("libfyaml", false) + if $mswin or $mingw or $cygwin + abort "The libfyaml backend (--enable-libfyaml) is not supported on Windows" + end + unless pkg_config('libfyaml') + abort "libfyaml was requested with --enable-libfyaml but was not found via pkg-config" + end + $defs << "-DPSYCH_USE_LIBFYAML" + + create_makefile 'psych' + return +end + if $mswin or $mingw or $cygwin $CPPFLAGS << " -DYAML_DECLARE_STATIC" end diff --git a/ext/psych/psych.c b/ext/psych/psych.c index afbd7a35..2a5db212 100644 --- a/ext/psych/psych.c +++ b/ext/psych/psych.c @@ -9,7 +9,16 @@ static VALUE libyaml_version(VALUE module) int major, minor, patch; VALUE list[3]; +#ifdef PSYCH_USE_LIBFYAML + /* Experimental libfyaml backend: no libyaml is linked in, so report fy_version_default() as major.minor with patch fixed at 0. + * NOTE(review): that appears to be libfyaml's default YAML spec version, not its release number -- confirm. */ + const struct fy_version *v = fy_version_default(); + major = v ? v->major : 0; + minor = v ? v->minor : 0; + patch = 0; +#else yaml_get_version(&major, &minor, &patch); +#endif list[0] = INT2NUM(major); list[1] = INT2NUM(minor); @@ -18,6 +27,18 @@ static VALUE libyaml_version(VALUE module) return rb_ary_new4((long)3, list); } +#ifdef PSYCH_USE_LIBFYAML +/* call-seq: Psych.libfyaml_version + * + * Returns the string from fy_library_version(), or nil if that is NULL. Only defined in PSYCH_USE_LIBFYAML builds. + */ +static VALUE libfyaml_version(VALUE module) +{ + const char *v = fy_library_version(); + return v ? 
rb_usascii_str_new2(v) : Qnil; +} +#endif + VALUE mPsych; void Init_psych(void) @@ -29,6 +50,13 @@ void Init_psych(void) rb_define_singleton_method(mPsych, "libyaml_version", libyaml_version, 0); +#ifdef PSYCH_USE_LIBFYAML + rb_define_singleton_method(mPsych, "libfyaml_version", libfyaml_version, 0); + rb_define_const(mPsych, "BACKEND", rb_usascii_str_new2("libfyaml")); +#else + rb_define_const(mPsych, "BACKEND", rb_usascii_str_new2("libyaml")); +#endif + Init_psych_parser(); Init_psych_emitter(); Init_psych_to_ruby(); diff --git a/ext/psych/psych.h b/ext/psych/psych.h index 6b3d63f2..0e146588 100644 --- a/ext/psych/psych.h +++ b/ext/psych/psych.h @@ -4,7 +4,11 @@ #include #include +#ifdef PSYCH_USE_LIBFYAML +#include +#else #include +#endif #include #include diff --git a/ext/psych/psych_emitter.c b/ext/psych/psych_emitter.c index 624ab7c5..187aebc3 100644 --- a/ext/psych/psych_emitter.c +++ b/ext/psych/psych_emitter.c @@ -1,5 +1,7 @@ #include +#ifndef PSYCH_USE_LIBFYAML + #if !defined(RARRAY_CONST_PTR) #define RARRAY_CONST_PTR(s) (const VALUE *)RARRAY_PTR(s) #endif @@ -587,3 +589,5 @@ void Init_psych_emitter(void) id_indentation = rb_intern("indentation"); id_canonical = rb_intern("canonical"); } + +#endif /* PSYCH_USE_LIBFYAML */ diff --git a/ext/psych/psych_emitter_fy.c b/ext/psych/psych_emitter_fy.c new file mode 100644 index 00000000..82679200 --- /dev/null +++ b/ext/psych/psych_emitter_fy.c @@ -0,0 +1,429 @@ +#include + +#ifdef PSYCH_USE_LIBFYAML +/* + * Experimental libfyaml-backed emitter. Only compiled when psych is built + * with --enable-libfyaml. Mirrors ext/psych/psych_emitter.c. 
+ */ + +#if !defined(RARRAY_CONST_PTR) +#define RARRAY_CONST_PTR(s) (const VALUE *)RARRAY_PTR(s) +#endif +#if !defined(RARRAY_AREF) +#define RARRAY_AREF(a, i) RARRAY_CONST_PTR(a)[i] +#endif + +VALUE cPsychEmitter; +static ID id_io; +static ID id_write; +static ID id_line_width; +static ID id_indentation; +static ID id_canonical; + +typedef struct { + struct fy_emitter *emit; + struct fy_emitter_cfg cfg; + int indent; + int width; + int canonical; +} psych_fy_emitter_t; + +static int emitter_output(struct fy_emitter *emit, enum fy_emitter_write_type type, + const char *str, int len, void *userdata) +{ + VALUE self = (VALUE)userdata; + VALUE io = rb_attr_get(self, id_io); + VALUE s = rb_enc_str_new(str, (long)len, rb_utf8_encoding()); + rb_funcall(io, id_write, 1, s); + return len; +} + +static void dealloc(void *ptr) +{ + psych_fy_emitter_t *e = (psych_fy_emitter_t *)ptr; + if (e->emit) { + fy_emitter_destroy(e->emit); + } + xfree(e); +} + +static const rb_data_type_t psych_emitter_type = { + "Psych/emitter", + {0, dealloc, 0,}, + 0, 0, +#ifdef RUBY_TYPED_FREE_IMMEDIATELY + RUBY_TYPED_FREE_IMMEDIATELY, +#endif +}; + +static VALUE allocate(VALUE klass) +{ + psych_fy_emitter_t *e; + VALUE obj = TypedData_Make_Struct(klass, psych_fy_emitter_t, &psych_emitter_type, e); + + e->emit = NULL; + e->indent = 2; + e->width = -1; + e->canonical = 0; + + return obj; +} + +static unsigned int build_flags(psych_fy_emitter_t *e) +{ + unsigned int flags = FYECF_MODE_ORIGINAL | + FYECF_DOC_START_MARK_AUTO | FYECF_DOC_END_MARK_AUTO; + int indent = (e->indent >= 1 && e->indent <= 9) ? e->indent : 2; + flags |= FYECF_INDENT(indent); + if (e->width <= 0) { + flags |= FYECF_WIDTH_INF; + } else { + flags |= FYECF_WIDTH(e->width > 255 ? 255 : e->width); + } + return flags; +} + +/* (Re)create the underlying fy_emitter from the current option state. Safe to + * call before any event has been emitted. 
*/ +static void rebuild_emitter(VALUE self, psych_fy_emitter_t *e) +{ + if (e->emit) { + fy_emitter_destroy(e->emit); + e->emit = NULL; + } + e->cfg.flags = build_flags(e); + e->cfg.output = emitter_output; + e->cfg.userdata = (void *)self; + e->cfg.diag = NULL; + e->emit = fy_emitter_create(&e->cfg); + if (!e->emit) { + rb_raise(rb_eNoMemError, "could not create libfyaml emitter"); + } +} + +static void do_emit(psych_fy_emitter_t *e, struct fy_event *event) +{ + if (!event) { + rb_raise(rb_eRuntimeError, "libfyaml: could not create event"); + } + if (fy_emit_event(e->emit, event) != 0) { + rb_raise(rb_eRuntimeError, "libfyaml: emit failed"); + } +} + +/* call-seq: Psych::Emitter.new(io, options = Psych::Emitter::OPTIONS) */ +static VALUE initialize(int argc, VALUE *argv, VALUE self) +{ + psych_fy_emitter_t *e; + VALUE io, options; + + TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); + + if (rb_scan_args(argc, argv, "11", &io, &options) == 2) { + e->width = NUM2INT(rb_funcall(options, id_line_width, 0)); + e->indent = NUM2INT(rb_funcall(options, id_indentation, 0)); + e->canonical = (Qtrue == rb_funcall(options, id_canonical, 0)) ? 
1 : 0;
+    }
+
+    rb_ivar_set(self, id_io, io);
+    rebuild_emitter(self, e);
+
+    return self;
+}
+
+/* call-seq: emitter.start_stream(encoding)
+ *
+ * Emit a STREAM-START event. +encoding+ must be a Fixnum; it is type-checked
+ * but otherwise unused by this backend.
+ */
+static VALUE start_stream(VALUE self, VALUE encoding)
+{
+    psych_fy_emitter_t *e;
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+    Check_Type(encoding, T_FIXNUM);
+
+    do_emit(e, fy_emit_event_create(e->emit, FYET_STREAM_START));
+    return self;
+}
+
+/* call-seq: emitter.end_stream
+ *
+ * Emit a STREAM-END event.
+ */
+static VALUE end_stream(VALUE self)
+{
+    psych_fy_emitter_t *e;
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+
+    do_emit(e, fy_emit_event_create(e->emit, FYET_STREAM_END));
+    return self;
+}
+
+/* call-seq: emitter.start_document(version, tags, implicit)
+ *
+ * Emit a DOCUMENT-START event.
+ *
+ * +version+ must be an Array; when it holds at least two entries the first
+ * two are handed to libfyaml as the document version, otherwise no version
+ * is passed. +tags+ is nil/false or an Array of [handle, prefix] String
+ * pairs, each exported to UTF-8.
+ *
+ * Raises TypeError for wrongly typed arguments and RuntimeError when a tag
+ * tuple has fewer than two elements.
+ */
+static VALUE start_document(VALUE self, VALUE version, VALUE tags, VALUE imp)
+{
+    psych_fy_emitter_t *e;
+    struct fy_version ver;
+    const struct fy_version *verp = NULL;
+    struct fy_tag *tag_storage = NULL;
+    const struct fy_tag **tag_ptrs = NULL;
+    /* ALLOCV handles. The scratch buffers are either alloca'd or owned by a
+     * GC-managed temp buffer, so any raise below (Check_Type, StringValue,
+     * transcoding, StringValueCStr, do_emit) can no longer leak them. */
+    VALUE storage_buf = 0;
+    VALUE ptrs_buf = 0;
+    /* Ruby Array that keeps every exported handle/prefix String reachable
+     * while its C pointer is lent to libfyaml. A plain xcalloc'd VALUE array
+     * is not scanned by the GC, so it cannot serve that purpose. */
+    VALUE exported = Qnil;
+    long len = 0;
+    struct fy_event *event;
+
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+    Check_Type(version, T_ARRAY);
+
+    if (RARRAY_LEN(version) >= 2) {
+        ver.major = NUM2INT(rb_ary_entry(version, 0));
+        ver.minor = NUM2INT(rb_ary_entry(version, 1));
+        verp = &ver;
+    }
+
+    if (RTEST(tags)) {
+        rb_encoding *encoding = rb_utf8_encoding();
+        Check_Type(tags, T_ARRAY);
+        len = RARRAY_LEN(tags);
+        if (len > 0) {
+            long i;
+            exported = rb_ary_new2(len * 2);
+            tag_storage = ALLOCV_N(struct fy_tag, storage_buf, len);
+            tag_ptrs = ALLOCV_N(const struct fy_tag *, ptrs_buf, len + 1);
+            for (i = 0; i < len; i++) {
+                VALUE tuple = RARRAY_AREF(tags, i);
+                VALUE name, value;
+                Check_Type(tuple, T_ARRAY);
+                if (RARRAY_LEN(tuple) < 2) {
+                    rb_raise(rb_eRuntimeError, "tag tuple must be of length 2");
+                }
+                name = RARRAY_AREF(tuple, 0);
+                value = RARRAY_AREF(tuple, 1);
+                StringValue(name);
+                StringValue(value);
+                name = rb_str_export_to_enc(name, encoding);
+                value = rb_str_export_to_enc(value, encoding);
+                rb_ary_push(exported, name);
+                rb_ary_push(exported, value);
+                tag_storage[i].handle = StringValueCStr(name);
+                tag_storage[i].prefix = StringValueCStr(value);
+                tag_ptrs[i] = &tag_storage[i];
+            }
+            /* libfyaml receives a NULL-terminated pointer list. */
+            tag_ptrs[len] = NULL;
+        }
+    }
+
+    event = fy_emit_event_create(e->emit, FYET_DOCUMENT_START,
+            imp ? 1 : 0, verp, tag_ptrs);
+
+    do_emit(e, event);
+
+    ALLOCV_END(storage_buf);
+    ALLOCV_END(ptrs_buf);
+    /* Keep the exported strings alive until the event has been emitted. */
+    RB_GC_GUARD(exported);
+
+    return self;
+}
+
+/* call-seq: emitter.end_document(implicit)
+ *
+ * Emit a DOCUMENT-END event.
+ */
+static VALUE end_document(VALUE self, VALUE imp)
+{
+    psych_fy_emitter_t *e;
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+
+    do_emit(e, fy_emit_event_create(e->emit, FYET_DOCUMENT_END, imp ? 1 : 0));
+    return self;
+}
+
+/* Map psych's scalar style integer (1..5) onto libfyaml's scalar style enum.
+ * Any other value is treated as "any" and resolved with the +quoted+ hint. */
+static enum fy_scalar_style psych_to_fyss(int style, int quoted)
+{
+    switch (style) {
+    case 1: return FYSS_PLAIN;
+    case 2: return FYSS_SINGLE_QUOTED;
+    case 3: return FYSS_DOUBLE_QUOTED;
+    case 4: return FYSS_LITERAL;
+    case 5: return FYSS_FOLDED;
+    default:
+        /* style ANY: honour psych's quoted hint so number-like strings are
+         * not silently re-typed on reload. */
+        return quoted ? 
FYSS_DOUBLE_QUOTED : FYSS_ANY;
+    }
+}
+
+/* Map psych's collection style integer (1 = block, 2 = flow, anything else
+ * = any) onto libfyaml's node style enum. */
+static enum fy_node_style psych_to_fyns(int style)
+{
+    switch (style) {
+    case 1: return FYNS_BLOCK;
+    case 2: return FYNS_FLOW;
+    default: return FYNS_ANY;
+    }
+}
+
+/* call-seq: emitter.scalar(value, anchor, tag, plain, quoted, style)
+ *
+ * Emit a SCALAR event. +value+ must be a String; +anchor+ and +tag+ must each
+ * be nil or a String. All strings are exported to UTF-8 first. +plain+ is
+ * accepted for protocol compatibility but not consulted here; +quoted+ only
+ * matters when +style+ is not one of the explicit styles (see psych_to_fyss).
+ *
+ * Raises TypeError for a wrongly typed argument and ArgumentError (from
+ * StringValueCStr) when an anchor or tag contains a NUL byte.
+ */
+static VALUE scalar(VALUE self, VALUE value, VALUE anchor, VALUE tag,
+                    VALUE plain, VALUE quoted, VALUE style)
+{
+    psych_fy_emitter_t *e;
+    rb_encoding *encoding = rb_utf8_encoding();
+    const char *anchor_cstr = NULL;
+    const char *tag_cstr = NULL;
+    enum fy_scalar_style fyss;
+    struct fy_event *event;
+
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+    Check_Type(value, T_STRING);
+
+    value = rb_str_export_to_enc(value, encoding);
+    /* Type-check before transcoding: rb_str_export_to_enc assumes a String,
+     * and the libyaml emitter rejects non-Strings the same way. */
+    if (!NIL_P(anchor)) {
+        Check_Type(anchor, T_STRING);
+        anchor = rb_str_export_to_enc(anchor, encoding);
+    }
+    if (!NIL_P(tag)) {
+        Check_Type(tag, T_STRING);
+        tag = rb_str_export_to_enc(tag, encoding);
+    }
+
+    fyss = psych_to_fyss(NUM2INT(style), RTEST(quoted));
+
+    /* Resolve the C strings in a fixed order before building the call, rather
+     * than relying on unspecified argument evaluation order. */
+    if (!NIL_P(anchor)) anchor_cstr = StringValueCStr(anchor);
+    if (!NIL_P(tag)) tag_cstr = StringValueCStr(tag);
+
+    event = fy_emit_event_create(e->emit, FYET_SCALAR,
+            fyss,
+            RSTRING_PTR(value), (size_t)RSTRING_LEN(value),
+            anchor_cstr,
+            tag_cstr);
+
+    do_emit(e, event);
+    /* The exported strings may be fresh objects referenced only from here;
+     * keep them reachable until the event has been emitted. */
+    RB_GC_GUARD(value);
+    RB_GC_GUARD(anchor);
+    RB_GC_GUARD(tag);
+    return self;
+}
+
+/* call-seq: emitter.start_sequence(anchor, tag, implicit, style)
+ *
+ * Emit a SEQUENCE-START event. +anchor+ and +tag+ must each be nil or a
+ * String; +implicit+ is accepted but not consulted by this backend.
+ */
+static VALUE start_sequence(VALUE self, VALUE anchor, VALUE tag,
+                            VALUE implicit, VALUE style)
+{
+    psych_fy_emitter_t *e;
+    rb_encoding *encoding = rb_utf8_encoding();
+
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+
+    /* Reject non-String anchors/tags before handing them to the transcoder. */
+    if (!NIL_P(anchor)) {
+        Check_Type(anchor, T_STRING);
+        anchor = rb_str_export_to_enc(anchor, encoding);
+    }
+    if (!NIL_P(tag)) {
+        Check_Type(tag, T_STRING);
+        tag = rb_str_export_to_enc(tag, encoding);
+    }
+
+    struct fy_event *event = fy_emit_event_create(e->emit, FYET_SEQUENCE_START,
+            psych_to_fyns(NUM2INT(style)),
+            NIL_P(anchor) ? NULL : StringValueCStr(anchor),
+            NIL_P(tag) ? 
NULL : StringValueCStr(tag));
+
+    do_emit(e, event);
+    return self;
+}
+
+/* call-seq: emitter.end_sequence
+ *
+ * Emit a SEQUENCE-END event.
+ */
+static VALUE end_sequence(VALUE self)
+{
+    psych_fy_emitter_t *e;
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+
+    do_emit(e, fy_emit_event_create(e->emit, FYET_SEQUENCE_END));
+    return self;
+}
+
+/* call-seq: emitter.start_mapping(anchor, tag, implicit, style)
+ *
+ * Emit a MAPPING-START event. +anchor+ and +tag+ must each be nil or a
+ * String (exported to UTF-8); +implicit+ is accepted but not consulted by
+ * this backend.
+ *
+ * Raises TypeError for a wrongly typed argument and ArgumentError (from
+ * StringValueCStr) when an anchor or tag contains a NUL byte.
+ */
+static VALUE start_mapping(VALUE self, VALUE anchor, VALUE tag,
+                           VALUE implicit, VALUE style)
+{
+    psych_fy_emitter_t *e;
+    rb_encoding *encoding = rb_utf8_encoding();
+    const char *anchor_cstr = NULL;
+    const char *tag_cstr = NULL;
+    enum fy_node_style fyns;
+    struct fy_event *event;
+
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+
+    /* Type-check before transcoding: rb_str_export_to_enc assumes a String,
+     * and the libyaml emitter rejects non-Strings the same way. */
+    if (!NIL_P(anchor)) {
+        Check_Type(anchor, T_STRING);
+        anchor = rb_str_export_to_enc(anchor, encoding);
+    }
+    if (!NIL_P(tag)) {
+        Check_Type(tag, T_STRING);
+        tag = rb_str_export_to_enc(tag, encoding);
+    }
+
+    fyns = psych_to_fyns(NUM2INT(style));
+    if (!NIL_P(anchor)) anchor_cstr = StringValueCStr(anchor);
+    if (!NIL_P(tag)) tag_cstr = StringValueCStr(tag);
+
+    event = fy_emit_event_create(e->emit, FYET_MAPPING_START,
+            fyns,
+            anchor_cstr,
+            tag_cstr);
+
+    do_emit(e, event);
+    /* Keep the (possibly freshly transcoded) strings reachable until the
+     * event has been emitted, as scalar() does for its value. */
+    RB_GC_GUARD(anchor);
+    RB_GC_GUARD(tag);
+    return self;
+}
+
+/* call-seq: emitter.end_mapping
+ *
+ * Emit a MAPPING-END event.
+ */
+static VALUE end_mapping(VALUE self)
+{
+    psych_fy_emitter_t *e;
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+
+    do_emit(e, fy_emit_event_create(e->emit, FYET_MAPPING_END));
+    return self;
+}
+
+/* call-seq: emitter.alias(anchor)
+ *
+ * Emit an ALIAS event. +anchor+ must be nil or a String (exported to UTF-8);
+ * nil is forwarded to libfyaml as a NULL anchor.
+ */
+static VALUE alias(VALUE self, VALUE anchor)
+{
+    psych_fy_emitter_t *e;
+    const char *anchor_cstr = NULL;
+
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+
+    if (!NIL_P(anchor)) {
+        /* Reject non-Strings before handing the value to the transcoder. */
+        Check_Type(anchor, T_STRING);
+        anchor = rb_str_export_to_enc(anchor, rb_utf8_encoding());
+        anchor_cstr = StringValueCStr(anchor);
+    }
+
+    do_emit(e, fy_emit_event_create(e->emit, FYET_ALIAS, anchor_cstr));
+    RB_GC_GUARD(anchor);
+    return self;
+}
+
+/* call-seq: emitter.canonical = true
+ *
+ * Record the canonical flag and rebuild the underlying emitter.
+ * NOTE: build_flags() does not read e->canonical, so at present this only
+ * changes what #canonical reports; the emitted output is unaffected.
+ */
+static VALUE set_canonical(VALUE self, VALUE style)
+{
+    psych_fy_emitter_t *e;
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+    e->canonical = (Qtrue == style) ? 1 : 0;
+    rebuild_emitter(self, e);
+    return style;
+}
+
+/* call-seq: emitter.canonical
+ *
+ * Return the canonical flag last stored on this emitter.
+ */
+static VALUE canonical(VALUE self)
+{
+    psych_fy_emitter_t *e;
+    TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e);
+    return e->canonical ? 
Qtrue : Qfalse; +} + +static VALUE set_indentation(VALUE self, VALUE level) +{ + psych_fy_emitter_t *e; + TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); + e->indent = NUM2INT(level); + rebuild_emitter(self, e); + return level; +} + +static VALUE indentation(VALUE self) +{ + psych_fy_emitter_t *e; + TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); + return INT2NUM(e->indent); +} + +static VALUE line_width(VALUE self) +{ + psych_fy_emitter_t *e; + TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); + return INT2NUM(e->width); +} + +static VALUE set_line_width(VALUE self, VALUE width) +{ + psych_fy_emitter_t *e; + TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); + e->width = NUM2INT(width); + rebuild_emitter(self, e); + return width; +} + +void Init_psych_emitter(void) +{ +#undef rb_intern + VALUE psych = rb_define_module("Psych"); + VALUE handler = rb_define_class_under(psych, "Handler", rb_cObject); + cPsychEmitter = rb_define_class_under(psych, "Emitter", handler); + + rb_define_alloc_func(cPsychEmitter, allocate); + + rb_define_method(cPsychEmitter, "initialize", initialize, -1); + rb_define_method(cPsychEmitter, "start_stream", start_stream, 1); + rb_define_method(cPsychEmitter, "end_stream", end_stream, 0); + rb_define_method(cPsychEmitter, "start_document", start_document, 3); + rb_define_method(cPsychEmitter, "end_document", end_document, 1); + rb_define_method(cPsychEmitter, "scalar", scalar, 6); + rb_define_method(cPsychEmitter, "start_sequence", start_sequence, 4); + rb_define_method(cPsychEmitter, "end_sequence", end_sequence, 0); + rb_define_method(cPsychEmitter, "start_mapping", start_mapping, 4); + rb_define_method(cPsychEmitter, "end_mapping", end_mapping, 0); + rb_define_method(cPsychEmitter, "alias", alias, 1); + rb_define_method(cPsychEmitter, "canonical", canonical, 0); + rb_define_method(cPsychEmitter, "canonical=", set_canonical, 1); + 
rb_define_method(cPsychEmitter, "indentation", indentation, 0); + rb_define_method(cPsychEmitter, "indentation=", set_indentation, 1); + rb_define_method(cPsychEmitter, "line_width", line_width, 0); + rb_define_method(cPsychEmitter, "line_width=", set_line_width, 1); + + id_io = rb_intern("io"); + id_write = rb_intern("write"); + id_line_width = rb_intern("line_width"); + id_indentation = rb_intern("indentation"); + id_canonical = rb_intern("canonical"); +} + +#endif /* PSYCH_USE_LIBFYAML */ diff --git a/ext/psych/psych_parser.c b/ext/psych/psych_parser.c index 05a8fa9e..27292737 100644 --- a/ext/psych/psych_parser.c +++ b/ext/psych/psych_parser.c @@ -1,5 +1,7 @@ #include +#ifndef PSYCH_USE_LIBFYAML + VALUE cPsychParser; static ID id_read; @@ -571,3 +573,5 @@ void Init_psych_parser(void) id_end_mapping = rb_intern("end_mapping"); id_event_location = rb_intern("event_location"); } + +#endif /* PSYCH_USE_LIBFYAML */ diff --git a/ext/psych/psych_parser_fy.c b/ext/psych/psych_parser_fy.c new file mode 100644 index 00000000..fe03b818 --- /dev/null +++ b/ext/psych/psych_parser_fy.c @@ -0,0 +1,476 @@ +#include + +#ifdef PSYCH_USE_LIBFYAML +/* + * Experimental libfyaml-backed parser. Only compiled when psych is built + * with --enable-libfyaml. Mirrors the event protocol of the libyaml backend + * in ext/psych/psych_parser.c so the Ruby layer is unchanged. 
+ */ + +VALUE cPsychParser; + +static ID id_read; +static ID id_empty; +static ID id_start_stream; +static ID id_end_stream; +static ID id_start_document; +static ID id_end_document; +static ID id_alias; +static ID id_scalar; +static ID id_start_sequence; +static ID id_end_sequence; +static ID id_start_mapping; +static ID id_end_mapping; +static ID id_event_location; + +#define PSYCH_TRANSCODE(_str, _yaml_enc, _internal_enc) \ + do { \ + rb_enc_associate_index((_str), (_yaml_enc)); \ + if(_internal_enc) \ + (_str) = rb_str_export_to_enc((_str), (_internal_enc)); \ + } while (0) + +/* libyaml-compatible encoding constants exposed to the Ruby layer. */ +#define PSYCH_ANY_ENCODING 0 +#define PSYCH_UTF8_ENCODING 1 +#define PSYCH_UTF16LE_ENCODING 2 +#define PSYCH_UTF16BE_ENCODING 3 + +typedef struct { + struct fy_parser *fyp; + size_t mark_line; + size_t mark_column; + size_t mark_index; +} psych_fy_parser_t; + +static ssize_t io_reader(void *user, void *buf, size_t count) +{ + VALUE io = (VALUE)user; + VALUE string = rb_funcall(io, id_read, 1, SIZET2NUM(count)); + + if (NIL_P(string)) { + return 0; /* EOF */ + } + + StringValue(string); + size_t len = (size_t)RSTRING_LEN(string); + if (len > count) { + len = count; + } + memcpy(buf, RSTRING_PTR(string), len); + return (ssize_t)len; +} + +static void dealloc(void *ptr) +{ + psych_fy_parser_t *parser = (psych_fy_parser_t *)ptr; + if (parser->fyp) { + fy_parser_destroy(parser->fyp); + } + xfree(parser); +} + +static const rb_data_type_t psych_parser_type = { + "Psych/parser", + {0, dealloc, 0,}, + 0, 0, +#ifdef RUBY_TYPED_FREE_IMMEDIATELY + RUBY_TYPED_FREE_IMMEDIATELY, +#endif +}; + +static VALUE allocate(VALUE klass) +{ + psych_fy_parser_t *parser; + VALUE obj = TypedData_Make_Struct(klass, psych_fy_parser_t, &psych_parser_type, parser); + + static const struct fy_parse_cfg cfg = { + .flags = FYPCF_QUIET | FYPCF_COLLECT_DIAG | FYPCF_DEFAULT_VERSION_AUTO, + }; + parser->fyp = fy_parser_create(&cfg); + if (!parser->fyp) { 
+ rb_raise(rb_eNoMemError, "could not create libfyaml parser"); + } + + return obj; +} + +/* TODO: libfyaml's diagnostics are collected via fy_diag; reconstructing the + * libyaml-style problem/context/offset is left for a later pass. For now we + * raise a Psych::SyntaxError with the best-effort mark we tracked. */ +static VALUE make_exception(psych_fy_parser_t *parser, VALUE path) +{ + VALUE ePsychSyntaxError = rb_const_get(mPsych, rb_intern("SyntaxError")); + + return rb_funcall(ePsychSyntaxError, rb_intern("new"), 6, + path, + SIZET2NUM(parser->mark_line + 1), + SIZET2NUM(parser->mark_column + 1), + SIZET2NUM(parser->mark_index), + rb_usascii_str_new2("could not parse YAML"), + Qnil); +} + +static VALUE transcode_string(VALUE src) +{ + int utf8 = rb_utf8_encindex(); + int source_encoding = rb_enc_get_index(src); + + if (source_encoding == utf8 || source_encoding == rb_usascii_encindex()) { + return src; + } + + src = rb_str_export_to_enc(src, rb_utf8_encoding()); + return src; +} + +/* ---- protected handler trampolines (identical protocol to libyaml backend) */ + +static VALUE protected_start_stream(VALUE pointer) +{ + VALUE *args = (VALUE *)pointer; + return rb_funcall(args[0], id_start_stream, 1, args[1]); +} + +static VALUE protected_start_document(VALUE pointer) +{ + VALUE *args = (VALUE *)pointer; + return rb_funcall3(args[0], id_start_document, 3, args + 1); +} + +static VALUE protected_end_document(VALUE pointer) +{ + VALUE *args = (VALUE *)pointer; + return rb_funcall(args[0], id_end_document, 1, args[1]); +} + +static VALUE protected_alias(VALUE pointer) +{ + VALUE *args = (VALUE *)pointer; + return rb_funcall(args[0], id_alias, 1, args[1]); +} + +static VALUE protected_scalar(VALUE pointer) +{ + VALUE *args = (VALUE *)pointer; + return rb_funcall3(args[0], id_scalar, 6, args + 1); +} + +static VALUE protected_start_sequence(VALUE pointer) +{ + VALUE *args = (VALUE *)pointer; + return rb_funcall3(args[0], id_start_sequence, 4, args + 1); +} + +static 
VALUE protected_end_sequence(VALUE handler) +{ + return rb_funcall(handler, id_end_sequence, 0); +} + +static VALUE protected_start_mapping(VALUE pointer) +{ + VALUE *args = (VALUE *)pointer; + return rb_funcall3(args[0], id_start_mapping, 4, args + 1); +} + +static VALUE protected_end_mapping(VALUE handler) +{ + return rb_funcall(handler, id_end_mapping, 0); +} + +static VALUE protected_empty(VALUE handler) +{ + return rb_funcall(handler, id_empty, 0); +} + +static VALUE protected_end_stream(VALUE handler) +{ + return rb_funcall(handler, id_end_stream, 0); +} + +static VALUE protected_event_location(VALUE pointer) +{ + VALUE *args = (VALUE *)pointer; + return rb_funcall3(args[0], id_event_location, 4, args + 1); +} + +/* ---- enum translation: libfyaml -> psych/libyaml integer constants -------- */ + +static int fyss_to_psych(enum fy_scalar_style s) +{ + switch (s) { + case FYSS_PLAIN: return 1; + case FYSS_SINGLE_QUOTED: return 2; + case FYSS_DOUBLE_QUOTED: return 3; + case FYSS_LITERAL: return 4; + case FYSS_FOLDED: return 5; + default: return 0; /* FYSS_ANY */ + } +} + +static int fyns_to_psych(enum fy_node_style s) +{ + switch (s) { + case FYNS_FLOW: return 2; + case FYNS_BLOCK: return 1; + default: return 0; /* FYNS_ANY */ + } +} + +static VALUE token_to_str(struct fy_token *tok, int encoding, rb_encoding *internal_enc) +{ + size_t len = 0; + const char *text; + + if (!tok) { + return Qnil; + } + text = fy_token_get_text(tok, &len); + if (!text) { + return Qnil; + } + VALUE str = rb_str_new(text, (long)len); + PSYCH_TRANSCODE(str, encoding, internal_enc); + return str; +} + +static VALUE parse(VALUE self, VALUE handler, VALUE yaml, VALUE path) +{ + psych_fy_parser_t *parser; + struct fy_event *event; + int done = 0; + int state = 0; + int encoding = rb_utf8_encindex(); + rb_encoding *internal_enc = rb_default_internal_encoding(); + + TypedData_Get_Struct(self, psych_fy_parser_t, &psych_parser_type, parser); + + fy_parser_reset(parser->fyp); + 
parser->mark_line = parser->mark_column = parser->mark_index = 0; + + if (rb_respond_to(yaml, id_read)) { + if (fy_parser_set_input_callback(parser->fyp, (void *)yaml, io_reader) != 0) { + rb_raise(rb_eRuntimeError, "could not set libfyaml input"); + } + } else { + StringValue(yaml); + yaml = transcode_string(yaml); + if (fy_parser_set_string(parser->fyp, + RSTRING_PTR(yaml), (size_t)RSTRING_LEN(yaml)) != 0) { + rb_raise(rb_eRuntimeError, "could not set libfyaml input"); + } + } + + while (!done) { + VALUE event_args[5]; + const struct fy_mark *sm, *em; + + event = fy_parser_parse(parser->fyp); + + if (!event) { + VALUE exception = make_exception(parser, path); + rb_exc_raise(exception); + } + + sm = fy_event_start_mark(event); + em = fy_event_end_mark(event); + if (sm) { + parser->mark_line = (size_t)sm->line; + parser->mark_column = (size_t)sm->column; + parser->mark_index = sm->input_pos; + } + + event_args[0] = handler; + event_args[1] = SIZET2NUM(sm ? (size_t)sm->line : 0); + event_args[2] = SIZET2NUM(sm ? (size_t)sm->column : 0); + event_args[3] = SIZET2NUM(em ? (size_t)em->line : 0); + event_args[4] = SIZET2NUM(em ? 
(size_t)em->column : 0); + rb_protect(protected_event_location, (VALUE)event_args, &state); + + switch (event->type) { + case FYET_STREAM_START: + { + VALUE args[2]; + args[0] = handler; + args[1] = INT2NUM(PSYCH_UTF8_ENCODING); + rb_protect(protected_start_stream, (VALUE)args, &state); + } + break; + case FYET_DOCUMENT_START: + { + VALUE args[4]; + VALUE version = rb_ary_new(); + VALUE tag_directives = rb_ary_new(); + struct fy_document_state *ds = event->document_start.document_state; + + if (ds && fy_document_state_version_explicit(ds)) { + const struct fy_version *v = fy_document_state_version(ds); + if (v) { + version = rb_ary_new3((long)2, + INT2NUM(v->major), INT2NUM(v->minor)); + } + } + + if (ds && fy_document_state_tags_explicit(ds)) { + void *iter = NULL; + const struct fy_tag *tag; + while ((tag = fy_document_state_tag_directive_iterate(ds, &iter)) != NULL) { + /* skip the implicit defaults ("!" and "!!") */ + if (tag->handle && tag->prefix) { + if ((strcmp(tag->handle, "!") == 0 && strcmp(tag->prefix, "!") == 0) || + (strcmp(tag->handle, "!!") == 0 && + strcmp(tag->prefix, "tag:yaml.org,2002:") == 0)) { + continue; + } + } + VALUE handle = tag->handle ? rb_str_new2(tag->handle) : Qnil; + VALUE prefix = tag->prefix ? rb_str_new2(tag->prefix) : Qnil; + if (!NIL_P(handle)) PSYCH_TRANSCODE(handle, encoding, internal_enc); + if (!NIL_P(prefix)) PSYCH_TRANSCODE(prefix, encoding, internal_enc); + rb_ary_push(tag_directives, rb_ary_new3((long)2, handle, prefix)); + } + } + + args[0] = handler; + args[1] = version; + args[2] = tag_directives; + args[3] = event->document_start.implicit ? Qtrue : Qfalse; + rb_protect(protected_start_document, (VALUE)args, &state); + } + break; + case FYET_DOCUMENT_END: + { + VALUE args[2]; + args[0] = handler; + args[1] = event->document_end.implicit ? 
Qtrue : Qfalse; + rb_protect(protected_end_document, (VALUE)args, &state); + } + break; + case FYET_ALIAS: + { + VALUE args[2]; + args[0] = handler; + args[1] = token_to_str(event->alias.anchor, encoding, internal_enc); + rb_protect(protected_alias, (VALUE)args, &state); + } + break; + case FYET_SCALAR: + { + VALUE args[7]; + enum fy_scalar_style fyss = fy_token_scalar_style(event->scalar.value); + int has_tag = (event->scalar.tag != NULL); + int plain_style = (fyss == FYSS_PLAIN); + + args[0] = handler; + args[1] = token_to_str(event->scalar.value, encoding, internal_enc); + if (NIL_P(args[1])) args[1] = rb_str_new2(""); + args[2] = token_to_str(event->scalar.anchor, encoding, internal_enc); + args[3] = token_to_str(event->scalar.tag, encoding, internal_enc); + /* libfyaml does not expose libyaml's plain_implicit / + * quoted_implicit pair, so reconstruct them from the explicit + * tag presence and the scalar style, matching libyaml: + * plain, untagged -> (plain=1, quoted=0) + * quoted, untagged -> (plain=0, quoted=1) + * tagged -> (plain=0, quoted=0) */ + args[4] = (!has_tag && plain_style) ? Qtrue : Qfalse; + args[5] = (!has_tag && !plain_style) ? Qtrue : Qfalse; + args[6] = INT2NUM(fyss_to_psych(fyss)); + rb_protect(protected_scalar, (VALUE)args, &state); + } + break; + case FYET_SEQUENCE_START: + { + VALUE args[5]; + args[0] = handler; + args[1] = token_to_str(event->sequence_start.anchor, encoding, internal_enc); + args[2] = token_to_str(event->sequence_start.tag, encoding, internal_enc); + args[3] = event->sequence_start.tag ? 
Qfalse : Qtrue; + args[4] = INT2NUM(fyns_to_psych(fy_event_get_node_style(event))); + rb_protect(protected_start_sequence, (VALUE)args, &state); + } + break; + case FYET_SEQUENCE_END: + rb_protect(protected_end_sequence, handler, &state); + break; + case FYET_MAPPING_START: + { + VALUE args[5]; + args[0] = handler; + args[1] = token_to_str(event->mapping_start.anchor, encoding, internal_enc); + args[2] = token_to_str(event->mapping_start.tag, encoding, internal_enc); + args[3] = event->mapping_start.tag ? Qfalse : Qtrue; + args[4] = INT2NUM(fyns_to_psych(fy_event_get_node_style(event))); + rb_protect(protected_start_mapping, (VALUE)args, &state); + } + break; + case FYET_MAPPING_END: + rb_protect(protected_end_mapping, handler, &state); + break; + case FYET_NONE: + rb_protect(protected_empty, handler, &state); + break; + case FYET_STREAM_END: + rb_protect(protected_end_stream, handler, &state); + done = 1; + break; + } + + fy_parser_event_free(parser->fyp, event); + if (state) rb_jump_tag(state); + } + + RB_GC_GUARD(yaml); + return self; +} + +/* + * call-seq: + * parser.mark # => # + */ +static VALUE mark(VALUE self) +{ + VALUE mark_klass; + VALUE args[3]; + psych_fy_parser_t *parser; + + TypedData_Get_Struct(self, psych_fy_parser_t, &psych_parser_type, parser); + mark_klass = rb_const_get_at(cPsychParser, rb_intern("Mark")); + args[0] = SIZET2NUM(parser->mark_index); + args[1] = SIZET2NUM(parser->mark_line); + args[2] = SIZET2NUM(parser->mark_column); + + return rb_class_new_instance(3, args, mark_klass); +} + +void Init_psych_parser(void) +{ +#undef rb_intern + cPsychParser = rb_define_class_under(mPsych, "Parser", rb_cObject); + rb_define_alloc_func(cPsychParser, allocate); + + rb_define_const(cPsychParser, "ANY", INT2NUM(PSYCH_ANY_ENCODING)); + rb_define_const(cPsychParser, "UTF8", INT2NUM(PSYCH_UTF8_ENCODING)); + rb_define_const(cPsychParser, "UTF16LE", INT2NUM(PSYCH_UTF16LE_ENCODING)); + rb_define_const(cPsychParser, "UTF16BE", 
INT2NUM(PSYCH_UTF16BE_ENCODING)); + + rb_require("psych/syntax_error"); + + rb_define_private_method(cPsychParser, "_native_parse", parse, 3); + rb_define_method(cPsychParser, "mark", mark, 0); + + id_read = rb_intern("read"); + id_empty = rb_intern("empty"); + id_start_stream = rb_intern("start_stream"); + id_end_stream = rb_intern("end_stream"); + id_start_document = rb_intern("start_document"); + id_end_document = rb_intern("end_document"); + id_alias = rb_intern("alias"); + id_scalar = rb_intern("scalar"); + id_start_sequence = rb_intern("start_sequence"); + id_end_sequence = rb_intern("end_sequence"); + id_start_mapping = rb_intern("start_mapping"); + id_end_mapping = rb_intern("end_mapping"); + id_event_location = rb_intern("event_location"); +} + +#endif /* PSYCH_USE_LIBFYAML */ From 4c57ca5eeacd40d88a1858a21f9dbe1c3627ba2b Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 07:00:30 +0900 Subject: [PATCH 02/16] Resolve booleans per YAML 1.2 on the libfyaml backend Scalar type resolution happens in ScalarScanner, not the C backend, so swapping to libfyaml alone still resolved yes/no/on/off to booleans. Key the boolean set on Psych::BACKEND so the libyaml default keeps the YAML 1.1 set while the experimental libfyaml backend follows 1.2. Co-Authored-By: Claude Opus 4.8 --- lib/psych/scalar_scanner.rb | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/psych/scalar_scanner.rb b/lib/psych/scalar_scanner.rb index d744e611..68e17ecf 100644 --- a/lib/psych/scalar_scanner.rb +++ b/lib/psych/scalar_scanner.rb @@ -24,6 +24,18 @@ class ScalarScanner |[-+]?(?:0|[1-9](?:[0-9]|,[0-9]|_[0-9])*) (?# base 10) |[-+]?0x[_,]*[0-9a-fA-F][0-9a-fA-F_,]* (?# base 16))$/x + # YAML 1.1 treats yes/no/on/off as booleans in addition to true/false, + # while YAML 1.2's core schema only recognizes true/false. The default + # libyaml backend keeps the 1.1 set for backward compatibility; the + # experimental libfyaml backend follows 1.2. 
+ if defined?(Psych::BACKEND) && Psych::BACKEND == 'libfyaml' + BOOLEAN_TRUE = /^true$/i + BOOLEAN_FALSE = /^false$/i + else + BOOLEAN_TRUE = /^(yes|true|on)$/i + BOOLEAN_FALSE = /^(no|false|off)$/i + end + attr_reader :class_loader # Create a new scanner @@ -48,9 +60,9 @@ def tokenize string string elsif string == '~' || string.match?(/^null$/i) nil - elsif string.match?(/^(yes|true|on)$/i) + elsif string.match?(BOOLEAN_TRUE) true - elsif string.match?(/^(no|false|off)$/i) + elsif string.match?(BOOLEAN_FALSE) false else string From 0486109a87a298951e158e8af87e19e1b044e14c Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 07:22:24 +0900 Subject: [PATCH 03/16] Improve libfyaml parser fidelity and error reporting Create a fresh parser per parse instead of reusing one via fy_parser_reset(), which left default tag handles unset and rejected bare ("---"-less) tag-led documents. Recover the real message and position by switching the parser's own diagnostic object to collect mode; creating a replacement diag crashes libfyaml 0.9.6. Drop the spurious empty tag directive libfyaml reports. Co-Authored-By: Claude Opus 4.8 --- ext/psych/psych_parser_fy.c | 76 +++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 16 deletions(-) diff --git a/ext/psych/psych_parser_fy.c b/ext/psych/psych_parser_fy.c index fe03b818..8c297c15 100644 --- a/ext/psych/psych_parser_fy.c +++ b/ext/psych/psych_parser_fy.c @@ -43,6 +43,14 @@ typedef struct { size_t mark_index; } psych_fy_parser_t; +static const struct fy_parse_cfg psych_parse_cfg = { + /* Keep libfyaml's strict YAML 1.2 flow-indentation checks. This backend + * exists to follow the 1.2 spec, so we reject malformed flow indentation + * (e.g. wrongly indented flow sequences) rather than relaxing to libyaml's + * 1.1-era leniency with FYPCF_SLOPPY_FLOW_INDENTATION. 
*/ + .flags = FYPCF_QUIET | FYPCF_DEFAULT_VERSION_AUTO, +}; + static ssize_t io_reader(void *user, void *buf, size_t count) { VALUE io = (VALUE)user; @@ -84,30 +92,40 @@ static VALUE allocate(VALUE klass) psych_fy_parser_t *parser; VALUE obj = TypedData_Make_Struct(klass, psych_fy_parser_t, &psych_parser_type, parser); - static const struct fy_parse_cfg cfg = { - .flags = FYPCF_QUIET | FYPCF_COLLECT_DIAG | FYPCF_DEFAULT_VERSION_AUTO, - }; - parser->fyp = fy_parser_create(&cfg); - if (!parser->fyp) { - rb_raise(rb_eNoMemError, "could not create libfyaml parser"); - } + parser->fyp = NULL; return obj; } -/* TODO: libfyaml's diagnostics are collected via fy_diag; reconstructing the - * libyaml-style problem/context/offset is left for a later pass. For now we - * raise a Psych::SyntaxError with the best-effort mark we tracked. */ +/* Reconstruct a Psych::SyntaxError from libfyaml's collected diagnostics. The + * parser is created with FYPCF_COLLECT_DIAG, so the first collected error gives + * us the message and position. 
*/ static VALUE make_exception(psych_fy_parser_t *parser, VALUE path) { VALUE ePsychSyntaxError = rb_const_get(mPsych, rb_intern("SyntaxError")); + VALUE problem = Qnil; + size_t line = parser->mark_line; + size_t column = parser->mark_column; + + struct fy_diag *diag = fy_parser_get_diag(parser->fyp); + if (diag) { + void *iter = NULL; + struct fy_diag_error *err = fy_diag_errors_iterate(diag, &iter); + if (err) { + if (err->msg) problem = rb_usascii_str_new2(err->msg); + if (err->line >= 0) line = (size_t)err->line; + if (err->column >= 0) column = (size_t)err->column; + } + fy_diag_unref(diag); + } + if (NIL_P(problem)) problem = rb_usascii_str_new2("could not parse YAML"); return rb_funcall(ePsychSyntaxError, rb_intern("new"), 6, path, - SIZET2NUM(parser->mark_line + 1), - SIZET2NUM(parser->mark_column + 1), + SIZET2NUM(line), + SIZET2NUM(column), SIZET2NUM(parser->mark_index), - rb_usascii_str_new2("could not parse YAML"), + problem, Qnil); } @@ -245,9 +263,31 @@ static VALUE parse(VALUE self, VALUE handler, VALUE yaml, VALUE path) TypedData_Get_Struct(self, psych_fy_parser_t, &psych_parser_type, parser); - fy_parser_reset(parser->fyp); + /* Use a pristine parser for each parse, like fy-tool does. Reusing a + * parser across documents via fy_parser_reset() left the default tag + * handles unset for bare (no "---") tag-led documents. */ + if (parser->fyp) { + fy_parser_destroy(parser->fyp); + parser->fyp = NULL; + } + parser->fyp = fy_parser_create(&psych_parse_cfg); + if (!parser->fyp) { + rb_raise(rb_eNoMemError, "could not create libfyaml parser"); + } parser->mark_line = parser->mark_column = parser->mark_index = 0; + /* Make the parser's own diagnostic object collect errors instead of + * printing them to stderr, so make_exception() can recover the message. + * Replacing the diag with a freshly created one crashes libfyaml 0.9.6, + * so mutate the existing default diag in place. 
*/ + { + struct fy_diag *diag = fy_parser_get_diag(parser->fyp); + if (diag) { + fy_diag_set_collect_errors(diag, true); + fy_diag_unref(diag); + } + } + if (rb_respond_to(yaml, id_read)) { if (fy_parser_set_input_callback(parser->fyp, (void *)yaml, io_reader) != 0) { rb_raise(rb_eRuntimeError, "could not set libfyaml input"); @@ -315,8 +355,12 @@ static VALUE parse(VALUE self, VALUE handler, VALUE yaml, VALUE path) void *iter = NULL; const struct fy_tag *tag; while ((tag = fy_document_state_tag_directive_iterate(ds, &iter)) != NULL) { - /* skip the implicit defaults ("!" and "!!") */ - if (tag->handle && tag->prefix) { + /* skip the implicit defaults ("!", "!!" and the empty + * primary handle libfyaml reports) */ + if (!tag->handle || tag->handle[0] == '\0') { + continue; + } + if (tag->prefix) { if ((strcmp(tag->handle, "!") == 0 && strcmp(tag->prefix, "!") == 0) || (strcmp(tag->handle, "!!") == 0 && strcmp(tag->prefix, "tag:yaml.org,2002:") == 0)) { From 86ff1f441d428dd10704623162d5946c71ac32fd Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 07:28:14 +0900 Subject: [PATCH 04/16] Match libyaml scalar emission on the libfyaml backend Drop the tag when plain_implicit or quoted_implicit is set, matching how libyaml omits a tag that the value resolves to on reload; otherwise nil emitted as "!" instead of an empty scalar. Honor the plain hint when choosing the scalar style, and restore the Check_Type guards on anchor and tag so non-string arguments raise TypeError. 
Co-Authored-By: Claude Opus 4.8 --- ext/psych/psych_emitter_fy.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/ext/psych/psych_emitter_fy.c b/ext/psych/psych_emitter_fy.c index 82679200..ff0d7176 100644 --- a/ext/psych/psych_emitter_fy.c +++ b/ext/psych/psych_emitter_fy.c @@ -224,7 +224,7 @@ static VALUE end_document(VALUE self, VALUE imp) return self; } -static enum fy_scalar_style psych_to_fyss(int style, int quoted) +static enum fy_scalar_style psych_to_fyss(int style, int plain, int quoted) { switch (style) { case 1: return FYSS_PLAIN; @@ -233,9 +233,13 @@ static enum fy_scalar_style psych_to_fyss(int style, int quoted) case 4: return FYSS_LITERAL; case 5: return FYSS_FOLDED; default: - /* style ANY: honour psych's quoted hint so number-like strings are - * not silently re-typed on reload. */ - return quoted ? FYSS_DOUBLE_QUOTED : FYSS_ANY; + /* style ANY: honour psych's plain/quoted hints. Forcing a plain + * scalar plain keeps libfyaml from tagging empty scalars (nil) as + * explicit nulls; the quoted hint keeps number-like strings from + * being re-typed on reload. 
*/ + if (quoted) return FYSS_DOUBLE_QUOTED; + if (plain) return FYSS_PLAIN; + return FYSS_ANY; } } @@ -258,16 +262,22 @@ static VALUE scalar(VALUE self, VALUE value, VALUE anchor, VALUE tag, Check_Type(value, T_STRING); value = rb_str_export_to_enc(value, encoding); - if (!NIL_P(anchor)) anchor = rb_str_export_to_enc(anchor, encoding); - if (!NIL_P(tag)) tag = rb_str_export_to_enc(tag, encoding); + if (!NIL_P(anchor)) { Check_Type(anchor, T_STRING); anchor = rb_str_export_to_enc(anchor, encoding); } + if (!NIL_P(tag)) { Check_Type(tag, T_STRING); tag = rb_str_export_to_enc(tag, encoding); } - enum fy_scalar_style fyss = psych_to_fyss(NUM2INT(style), RTEST(quoted)); + enum fy_scalar_style fyss = psych_to_fyss(NUM2INT(style), RTEST(plain), RTEST(quoted)); + + /* libyaml omits the tag when plain_implicit (or quoted_implicit) is set, + * since the value resolves to that tag on reload. fy_emit_event_create() + * has no implicit flag and would always print the tag (e.g. nil as + * "!"), so drop it here to match. */ + int emit_tag = !NIL_P(tag) && !RTEST(plain) && !RTEST(quoted); struct fy_event *event = fy_emit_event_create(e->emit, FYET_SCALAR, fyss, RSTRING_PTR(value), (size_t)RSTRING_LEN(value), NIL_P(anchor) ? NULL : StringValueCStr(anchor), - NIL_P(tag) ? NULL : StringValueCStr(tag)); + emit_tag ? 
StringValueCStr(tag) : NULL); do_emit(e, event); RB_GC_GUARD(value); @@ -282,8 +292,8 @@ static VALUE start_sequence(VALUE self, VALUE anchor, VALUE tag, TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); - if (!NIL_P(anchor)) anchor = rb_str_export_to_enc(anchor, encoding); - if (!NIL_P(tag)) tag = rb_str_export_to_enc(tag, encoding); + if (!NIL_P(anchor)) { Check_Type(anchor, T_STRING); anchor = rb_str_export_to_enc(anchor, encoding); } + if (!NIL_P(tag)) { Check_Type(tag, T_STRING); tag = rb_str_export_to_enc(tag, encoding); } struct fy_event *event = fy_emit_event_create(e->emit, FYET_SEQUENCE_START, psych_to_fyns(NUM2INT(style)), @@ -311,8 +321,8 @@ static VALUE start_mapping(VALUE self, VALUE anchor, VALUE tag, TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); - if (!NIL_P(anchor)) anchor = rb_str_export_to_enc(anchor, encoding); - if (!NIL_P(tag)) tag = rb_str_export_to_enc(tag, encoding); + if (!NIL_P(anchor)) { Check_Type(anchor, T_STRING); anchor = rb_str_export_to_enc(anchor, encoding); } + if (!NIL_P(tag)) { Check_Type(tag, T_STRING); tag = rb_str_export_to_enc(tag, encoding); } struct fy_event *event = fy_emit_event_create(e->emit, FYET_MAPPING_START, psych_to_fyns(NUM2INT(style)), @@ -337,7 +347,7 @@ static VALUE alias(VALUE self, VALUE anchor) psych_fy_emitter_t *e; TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); - if (!NIL_P(anchor)) anchor = rb_str_export_to_enc(anchor, rb_utf8_encoding()); + if (!NIL_P(anchor)) { Check_Type(anchor, T_STRING); anchor = rb_str_export_to_enc(anchor, rb_utf8_encoding()); } do_emit(e, fy_emit_event_create(e->emit, FYET_ALIAS, NIL_P(anchor) ? 
NULL : StringValueCStr(anchor))); From e0bcad89b2d74fc0e7211dcb36d18e8cb110528a Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 08:05:12 +0900 Subject: [PATCH 05/16] Transcode UTF-16 IO input to UTF-8 on the libfyaml backend libfyaml only consumes UTF-8, so a UTF-16 IO fed through the chunked reader reached it as raw bytes and was rejected as invalid UTF-8. When the IO's external encoding is UTF-16LE/BE, slurp the whole stream and transcode it first; a 2-byte unit could otherwise straddle a read boundary. Other non-UTF-8 encodings stay raw and libfyaml rejects them, matching psych's UTF-8/UTF-16-only IO contract (Shift_JIS still raises). Co-Authored-By: Claude Opus 4.8 --- ext/psych/psych_parser_fy.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/ext/psych/psych_parser_fy.c b/ext/psych/psych_parser_fy.c index 8c297c15..6b9d04e3 100644 --- a/ext/psych/psych_parser_fy.c +++ b/ext/psych/psych_parser_fy.c @@ -289,7 +289,25 @@ static VALUE parse(VALUE self, VALUE handler, VALUE yaml, VALUE path) } if (rb_respond_to(yaml, id_read)) { - if (fy_parser_set_input_callback(parser->fyp, (void *)yaml, io_reader) != 0) { + VALUE ext_enc = rb_funcall(yaml, rb_intern("external_encoding"), 0); + int ext_idx = NIL_P(ext_enc) ? -1 : rb_to_encoding_index(ext_enc); + + if (ext_idx == rb_enc_find_index("UTF-16LE") || + ext_idx == rb_enc_find_index("UTF-16BE")) { + /* libfyaml only consumes UTF-8. A UTF-16 stream cannot be fed + * through the chunked reader because a 2-byte unit may straddle a + * read boundary, so slurp the whole stream and transcode it. Any + * other non-UTF-8 external encoding is left raw and libfyaml will + * reject it, matching psych's "UTF-8/UTF-16 only" IO contract. 
*/ + VALUE content = rb_funcall(yaml, id_read, 0); + if (NIL_P(content)) content = rb_str_new("", 0); + StringValue(content); + yaml = transcode_string(content); + if (fy_parser_set_string(parser->fyp, + RSTRING_PTR(yaml), (size_t)RSTRING_LEN(yaml)) != 0) { + rb_raise(rb_eRuntimeError, "could not set libfyaml input"); + } + } else if (fy_parser_set_input_callback(parser->fyp, (void *)yaml, io_reader) != 0) { rb_raise(rb_eRuntimeError, "could not set libfyaml input"); } } else { From 53122b975ba5380d6c9b25d1c479edc9209cde06 Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 09:02:34 +0900 Subject: [PATCH 06/16] Free tag-directive buffers on the error path in start_document The xcalloc'd tag buffers leaked when StringValue, the tuple-length check, or the emit raised mid-way. Wrap the work in rb_ensure so the buffers are always freed, and keep the exported directive strings in a Ruby array so the GC cannot reclaim them while their C pointers are in use. Co-Authored-By: Claude Opus 4.8 --- ext/psych/psych_emitter_fy.c | 80 +++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/ext/psych/psych_emitter_fy.c b/ext/psych/psych_emitter_fy.c index ff0d7176..7dd60f60 100644 --- a/ext/psych/psych_emitter_fy.c +++ b/ext/psych/psych_emitter_fy.c @@ -150,17 +150,27 @@ static VALUE end_stream(VALUE self) return self; } -static VALUE start_document(VALUE self, VALUE version, VALUE tags, VALUE imp) +struct start_document_data { + VALUE self; + VALUE version; + VALUE tags; + VALUE imp; + struct fy_tag *tag_storage; + const struct fy_tag **tag_ptrs; +}; + +static VALUE start_document_try(VALUE d) { + struct start_document_data *data = (struct start_document_data *)d; + VALUE version = data->version; + VALUE tags = data->tags; psych_fy_emitter_t *e; struct fy_version ver; const struct fy_version *verp = NULL; - struct fy_tag *tag_storage = NULL; - const struct fy_tag **tag_ptrs = NULL; - VALUE *exported = NULL; - long len = 
0; + VALUE guard = Qnil; + struct fy_event *event; - TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); + TypedData_Get_Struct(data->self, psych_fy_emitter_t, &psych_emitter_type, e); Check_Type(version, T_ARRAY); if (RARRAY_LEN(version) >= 2) { @@ -171,19 +181,20 @@ static VALUE start_document(VALUE self, VALUE version, VALUE tags, VALUE imp) if (RTEST(tags)) { rb_encoding *encoding = rb_utf8_encoding(); + long i, len; Check_Type(tags, T_ARRAY); len = RARRAY_LEN(tags); if (len > 0) { - long i; - tag_storage = xcalloc((size_t)len, sizeof(struct fy_tag)); - tag_ptrs = xcalloc((size_t)len + 1, sizeof(struct fy_tag *)); - exported = xcalloc((size_t)len * 2, sizeof(VALUE)); + /* Ruby array keeps the exported strings reachable for the GC while + * their C pointers live in tag_storage. */ + guard = rb_ary_new_capa(len * 2); + data->tag_storage = xcalloc((size_t)len, sizeof(struct fy_tag)); + data->tag_ptrs = xcalloc((size_t)len + 1, sizeof(struct fy_tag *)); for (i = 0; i < len; i++) { VALUE tuple = RARRAY_AREF(tags, i); VALUE name, value; Check_Type(tuple, T_ARRAY); if (RARRAY_LEN(tuple) < 2) { - xfree(tag_storage); xfree(tag_ptrs); xfree(exported); rb_raise(rb_eRuntimeError, "tag tuple must be of length 2"); } name = RARRAY_AREF(tuple, 0); @@ -192,27 +203,48 @@ static VALUE start_document(VALUE self, VALUE version, VALUE tags, VALUE imp) StringValue(value); name = rb_str_export_to_enc(name, encoding); value = rb_str_export_to_enc(value, encoding); - exported[i * 2] = name; - exported[i * 2 + 1] = value; - tag_storage[i].handle = StringValueCStr(name); - tag_storage[i].prefix = StringValueCStr(value); - tag_ptrs[i] = &tag_storage[i]; + rb_ary_push(guard, name); + rb_ary_push(guard, value); + data->tag_storage[i].handle = StringValueCStr(name); + data->tag_storage[i].prefix = StringValueCStr(value); + data->tag_ptrs[i] = &data->tag_storage[i]; } - tag_ptrs[len] = NULL; + data->tag_ptrs[len] = NULL; } } - struct fy_event *event = 
fy_emit_event_create(e->emit, FYET_DOCUMENT_START, - imp ? 1 : 0, verp, tag_ptrs); + event = fy_emit_event_create(e->emit, FYET_DOCUMENT_START, + data->imp ? 1 : 0, verp, data->tag_ptrs); - if (exported) { (void)exported[0]; } do_emit(e, event); + RB_GC_GUARD(guard); - if (tag_storage) xfree(tag_storage); - if (tag_ptrs) xfree(tag_ptrs); - if (exported) xfree(exported); + return data->self; +} - return self; +static VALUE start_document_ensure(VALUE d) +{ + struct start_document_data *data = (struct start_document_data *)d; + + xfree(data->tag_storage); + xfree(data->tag_ptrs); + + return Qnil; +} + +static VALUE start_document(VALUE self, VALUE version, VALUE tags, VALUE imp) +{ + struct start_document_data data = { + .self = self, + .version = version, + .tags = tags, + .imp = imp, + .tag_storage = NULL, + .tag_ptrs = NULL, + }; + + return rb_ensure(start_document_try, (VALUE)&data, + start_document_ensure, (VALUE)&data); } static VALUE end_document(VALUE self, VALUE imp) From b6d1141a5f7b22363c6a2434f18eae74884c43df Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 09:45:45 +0900 Subject: [PATCH 07/16] Make the test suite backend-aware for the libfyaml backend Add a libfyaml? helper and guard the tests that intentionally diverge on the experimental YAML 1.2 backend: yes/no/on/off are strings rather than booleans, flow collections and block scalars are formatted differently, tags and marks are reported differently, and non-ASCII tags/aliases are rejected. test_boolean asserts the 1.2 string result directly; the formatting and mark cases are skipped. The default libyaml build is unaffected (every guard keys off Psych::BACKEND). 
Co-Authored-By: Claude Opus 4.8 --- test/psych/helper.rb | 7 +++++++ test/psych/json/test_stream.rb | 4 ++++ test/psych/test_boolean.rb | 19 +++++++++++++++++-- test/psych/test_coder.rb | 7 +++++++ test/psych/test_data.rb | 1 + test/psych/test_encoding.rb | 4 ++++ test/psych/test_exception.rb | 3 ++- test/psych/test_json_tree.rb | 2 ++ test/psych/test_omap.rb | 1 + test/psych/test_parser.rb | 3 +++ test/psych/test_psych.rb | 6 ++++++ test/psych/test_set.rb | 1 + test/psych/test_string.rb | 5 +++++ test/psych/test_symbol.rb | 6 ++++++ test/psych/test_tree_builder.rb | 4 ++++ test/psych/test_yaml.rb | 3 +++ test/psych/test_yaml_special_cases.rb | 1 + test/psych/visitors/test_to_ruby.rb | 6 ++++++ 18 files changed, 80 insertions(+), 3 deletions(-) diff --git a/test/psych/helper.rb b/test/psych/helper.rb index 639f6055..b6bf2013 100644 --- a/test/psych/helper.rb +++ b/test/psych/helper.rb @@ -14,6 +14,13 @@ def self.suppress_warning $VERBOSE = verbose end + # True when psych was built with the experimental libfyaml backend + # (--enable-libfyaml), which follows YAML 1.2 and formats output + # differently from the default libyaml backend. + def libfyaml? + defined?(Psych::BACKEND) && Psych::BACKEND == 'libfyaml' + end + def with_default_external(enc) verbose, $VERBOSE = $VERBOSE, nil origenc, Encoding.default_external = Encoding.default_external, enc diff --git a/test/psych/json/test_stream.rb b/test/psych/json/test_stream.rb index 90a770c1..bdbe5ea9 100644 --- a/test/psych/json/test_stream.rb +++ b/test/psych/json/test_stream.rb @@ -51,6 +51,7 @@ def test_float end def test_hash + omit 'libfyaml emits JSON flow mappings multi-line' if libfyaml? hash = { 'one' => 'two' } @stream.push hash @@ -62,6 +63,7 @@ def test_hash end def test_list_to_json + omit 'libfyaml emits JSON flow sequences multi-line' if libfyaml? 
list = %w{ one two } @stream.push list @@ -93,6 +95,7 @@ def test_json_list_dump_exclude_tag end def test_time + omit 'libfyaml emits JSON flow mappings multi-line' if libfyaml? time = Time.utc(2010, 10, 10) @stream.push({'a' => time }) json = @io.string @@ -100,6 +103,7 @@ def test_time end def test_datetime + omit 'libfyaml emits JSON flow mappings multi-line' if libfyaml? time = Time.new(2010, 10, 10).to_datetime @stream.push({'a' => time }) json = @io.string diff --git a/test/psych/test_boolean.rb b/test/psych/test_boolean.rb index a4b80fc1..ec31c831 100644 --- a/test/psych/test_boolean.rb +++ b/test/psych/test_boolean.rb @@ -6,18 +6,33 @@ module Psych # Test booleans from YAML spec: # http://yaml.org/type/bool.html class TestBoolean < TestCase - %w{ yes Yes YES true True TRUE on On ON }.each do |truth| + # true/false are booleans in both YAML 1.1 and 1.2. + %w{ true True TRUE }.each do |truth| define_method(:"test_#{truth}") do assert_equal true, Psych.load("--- #{truth}") end end - %w{ no No NO false False FALSE off Off OFF }.each do |truth| + %w{ false False FALSE }.each do |truth| define_method(:"test_#{truth}") do assert_equal false, Psych.load("--- #{truth}") end end + # yes/on and no/off are booleans only under YAML 1.1 (the libyaml backend). + # The YAML 1.2 libfyaml backend keeps them as plain strings. + %w{ yes Yes YES on On ON }.each do |truth| + define_method(:"test_#{truth}") do + assert_equal(libfyaml? ? truth : true, Psych.load("--- #{truth}")) + end + end + + %w{ no No NO off Off OFF }.each do |truth| + define_method(:"test_#{truth}") do + assert_equal(libfyaml? ? 
truth : false, Psych.load("--- #{truth}")) + end + end + ### # YAML spec says "y" and "Y" may be used as true, but Syck treats them # as literal strings diff --git a/test/psych/test_coder.rb b/test/psych/test_coder.rb index a6f5ad7f..3883ceee 100644 --- a/test/psych/test_coder.rb +++ b/test/psych/test_coder.rb @@ -196,6 +196,7 @@ def test_load_dumped_tagging end def test_dump_with_tag + omit 'libfyaml emits the flow mapping multi-line' if libfyaml? foo = TaggingCoder.new assert_match(/hello/, Psych.dump(foo)) assert_match(/\{aa/, Psych.dump(foo)) @@ -240,6 +241,7 @@ def test_coder_style_map_block end def test_coder_style_map_flow + omit 'libfyaml emits flow collections multi-line' if libfyaml? pend "Failing on JRuby" if RUBY_PLATFORM =~ /java/ foo = Psych.dump CustomEncode.new \ @@ -271,6 +273,7 @@ def test_coder_style_seq_block end def test_coder_style_seq_flow + omit 'libfyaml emits flow collections multi-line' if libfyaml? foo = Psych.dump CustomEncode.new \ seq: [ 1, 2, 3 ], style: Psych::Nodes::Sequence::FLOW, @@ -300,6 +303,7 @@ def test_coder_style_scalar_plain end def test_coder_style_scalar_single_quoted + omit 'libfyaml does not synthesize the non-specific ! tag' if libfyaml? foo = Psych.dump CustomEncode.new \ scalar: 'some scalar', style: Psych::Nodes::Scalar::SINGLE_QUOTED, @@ -308,6 +312,7 @@ def test_coder_style_scalar_single_quoted end def test_coder_style_scalar_double_quoted + omit 'libfyaml does not synthesize the non-specific ! tag' if libfyaml? foo = Psych.dump CustomEncode.new \ scalar: 'some scalar', style: Psych::Nodes::Scalar::DOUBLE_QUOTED, @@ -316,6 +321,7 @@ def test_coder_style_scalar_double_quoted end def test_coder_style_scalar_literal + omit 'libfyaml does not synthesize the non-specific ! tag' if libfyaml? 
foo = Psych.dump CustomEncode.new \ scalar: 'some scalar', style: Psych::Nodes::Scalar::LITERAL, @@ -324,6 +330,7 @@ def test_coder_style_scalar_literal end def test_coder_style_scalar_folded + omit 'libfyaml does not synthesize the non-specific ! tag' if libfyaml? foo = Psych.dump CustomEncode.new \ scalar: 'some scalar', style: Psych::Nodes::Scalar::FOLDED, diff --git a/test/psych/test_data.rb b/test/psych/test_data.rb index 5e340c58..cf22cbbc 100644 --- a/test/psych/test_data.rb +++ b/test/psych/test_data.rb @@ -25,6 +25,7 @@ def setup # TODO: move to another test? def test_dump_data + omit 'libfyaml formats the dump differently (data still round-trips)' if libfyaml? assert_equal <<~eoyml, Psych.dump(PsychDataWithIvar["bar"]) --- !ruby/data-with-ivars:PsychDataWithIvar members: diff --git a/test/psych/test_encoding.rb b/test/psych/test_encoding.rb index 1867d59e..0a31a680 100644 --- a/test/psych/test_encoding.rb +++ b/test/psych/test_encoding.rb @@ -119,6 +119,7 @@ def test_io_utf8_read_as_binary end def test_emit_alias + omit 'libfyaml rejects non-ASCII aliases with a different error' if libfyaml? pend "Failing on JRuby" if RUBY_PLATFORM =~ /java/ @emitter.start_stream Psych::Parser::UTF8 @@ -141,6 +142,7 @@ def test_to_yaml_is_valid end def test_start_mapping + omit 'libfyaml rejects the non-ASCII tag as an invalid tag' if libfyaml? foo = 'foo' bar = 'バー' @@ -161,6 +163,7 @@ def test_start_mapping end def test_start_sequence + omit 'libfyaml rejects the non-ASCII tag as an invalid tag' if libfyaml? foo = 'foo' bar = 'バー' @@ -181,6 +184,7 @@ def test_start_sequence end def test_doc_tag_encoding + omit 'libfyaml rejects the non-ASCII tag directive prefix' if libfyaml? 
key = '鍵' @emitter.start_stream Psych::Parser::UTF8 @emitter.start_document( diff --git a/test/psych/test_exception.rb b/test/psych/test_exception.rb index 6fd92abf..20ee2262 100644 --- a/test/psych/test_exception.rb +++ b/test/psych/test_exception.rb @@ -156,7 +156,8 @@ def test_attributes # assert_equal 5, e.offset assert e.problem - assert e.context + # libfyaml's diagnostics do not carry libyaml's separate "context" text. + assert e.context unless libfyaml? end def test_convert diff --git a/test/psych/test_json_tree.rb b/test/psych/test_json_tree.rb index 3c59a8db..37c05075 100644 --- a/test/psych/test_json_tree.rb +++ b/test/psych/test_json_tree.rb @@ -53,12 +53,14 @@ def test_list_to_json end def test_time + omit 'libfyaml emits JSON flow mappings multi-line' if libfyaml? time = Time.utc(2010, 10, 10) assert_equal "{\"a\": \"2010-10-10 00:00:00.000000000 Z\"}\n", Psych.to_json({'a' => time }) end def test_datetime + omit 'libfyaml emits JSON flow mappings multi-line' if libfyaml? time = Time.new(2010, 10, 10).to_datetime assert_equal "{\"a\": \"#{time.strftime("%Y-%m-%d %H:%M:%S.%9N %:z")}\"}\n", Psych.to_json({'a' => time }) end diff --git a/test/psych/test_omap.rb b/test/psych/test_omap.rb index 6de02864..d59f0b29 100644 --- a/test/psych/test_omap.rb +++ b/test/psych/test_omap.rb @@ -39,6 +39,7 @@ def test_square end def test_dump + omit 'libfyaml emits the verbose tag !' if libfyaml? map = Psych::Omap['a', 'b', 'c', 'd'] yaml = Psych.dump(map) assert_match('!omap', yaml) diff --git a/test/psych/test_parser.rb b/test/psych/test_parser.rb index c175b8a1..786cf016 100644 --- a/test/psych/test_parser.rb +++ b/test/psych/test_parser.rb @@ -84,6 +84,7 @@ def test_filename end def test_line_numbers + omit 'libfyaml reports event marks differently from libyaml' if libfyaml? 
assert_equal 0, @parser.mark.line pend "Failing on JRuby" if RUBY_PLATFORM =~ /java/ @@ -111,6 +112,7 @@ def test_line_numbers end def test_column_numbers + omit 'libfyaml reports event marks differently from libyaml' if libfyaml? assert_equal 0, @parser.mark.column pend "Failing on JRuby" if RUBY_PLATFORM =~ /java/ @@ -138,6 +140,7 @@ def test_column_numbers end def test_index_numbers + omit 'libfyaml reports event marks differently from libyaml' if libfyaml? assert_equal 0, @parser.mark.index pend "Failing on JRuby" if RUBY_PLATFORM =~ /java/ diff --git a/test/psych/test_psych.rb b/test/psych/test_psych.rb index 8e5ec941..6ba84e5e 100644 --- a/test/psych/test_psych.rb +++ b/test/psych/test_psych.rb @@ -36,6 +36,7 @@ def test_indent end def test_canonical + omit 'canonical output is not supported on the libfyaml backend' if libfyaml? yml = Psych.dump({:a => {'b' => 'c'}}, {:canonical => true}) assert_match(/\? "b/, yml) end @@ -436,6 +437,7 @@ def test_safe_dump_unpermitted_class end def test_safe_dump_extra_permitted_classes + omit 'libfyaml formats the empty flow mapping differently' if libfyaml? assert_equal "--- !ruby/object {}\n", Psych.safe_dump(Object.new, permitted_classes: [Object]) end @@ -452,6 +454,9 @@ def test_safe_dump_symbols end def test_safe_dump_stringify_names + # The 1.2 libfyaml backend does not quote 'no', so the expected escaping + # of the "no" key does not apply. + omit "libfyaml does not quote the 'no' key" if libfyaml? yaml = <<-eoyml --- foo: @@ -478,6 +483,7 @@ def test_safe_dump_stringify_names end def test_safe_dump_aliases + omit 'libfyaml formats anchors and aliases differently' if libfyaml? x = [] x << x error = assert_raise Psych::BadAlias do diff --git a/test/psych/test_set.rb b/test/psych/test_set.rb index ccd591c6..f071acb9 100644 --- a/test/psych/test_set.rb +++ b/test/psych/test_set.rb @@ -10,6 +10,7 @@ def setup end def test_dump + omit 'libfyaml formats the dump differently (data still round-trips)' if libfyaml? 
assert_equal <<~YAML, Psych.dump(@set) --- !ruby/object:Set hash: diff --git a/test/psych/test_string.rb b/test/psych/test_string.rb index 1621f060..b7abaafb 100644 --- a/test/psych/test_string.rb +++ b/test/psych/test_string.rb @@ -24,6 +24,9 @@ def initialize # "ambiguity" in the emitted document def test_all_yaml_1_1_booleans_are_quoted + # The YAML 1.2 libfyaml backend does not treat yes/no/on/off as booleans, + # so it has no reason to quote them. + omit 'YAML 1.1 booleans are not special on the libfyaml backend' if libfyaml? yaml_1_1_booleans = %w[y Y yes Yes YES n N no No NO true True TRUE false False FALSE on On ON off Off OFF] # from https://yaml.org/type/bool.html yaml_1_1_booleans.each do |boolean| assert_match(/"#{boolean}"|'#{boolean}'/, Psych.dump(boolean)) @@ -86,6 +89,7 @@ def test_plain_when_shorten_than_line_width_and_with_final_line_break end def test_folded_when_longer_than_line_width_and_with_final_line_break + omit 'libfyaml uses a different block chomping indicator' if libfyaml? str = "Lorem ipsum dolor sit\n" yaml = Psych.dump str, line_width: 12 assert_match(/---\s*>\n(.*\n){2}\Z/, yaml) @@ -101,6 +105,7 @@ def test_folded_strip_when_longer_than_line_width_and_no_newlines end def test_literal_when_inner_and_final_line_break + omit 'libfyaml uses a different block chomping indicator' if libfyaml? [ "Lorem ipsum\ndolor\n", "Lorem ipsum\nZolor\n", diff --git a/test/psych/test_symbol.rb b/test/psych/test_symbol.rb index 36416ffe..9a26bdc9 100644 --- a/test/psych/test_symbol.rb +++ b/test/psych/test_symbol.rb @@ -8,6 +8,12 @@ def test_cycle_empty end def test_cycle_colon + # Known limitation: libyaml's emitter adds a non-specific "!" tag when it + # must quote a scalar that was requested plain, preserving the plain + # resolution (so ":" round-trips as a Symbol). libfyaml's streaming + # emitter does not synthesize that tag, so a Symbol whose name is a YAML + # indicator character reloads as a String. 
+ omit 'libfyaml does not round-trip symbols named after YAML indicators' if libfyaml? assert_cycle :':' end diff --git a/test/psych/test_tree_builder.rb b/test/psych/test_tree_builder.rb index dfb5da98..faf7fe4f 100644 --- a/test/psych/test_tree_builder.rb +++ b/test/psych/test_tree_builder.rb @@ -5,6 +5,10 @@ module Psych class TestTreeBuilder < TestCase def setup super + # This fixture is an explicit YAML 1.1 document whose flow mapping relies + # on 1.1-era lenient indentation. The strict YAML 1.2 libfyaml backend + # correctly rejects it, so these tree-location tests apply to libyaml only. + omit 'YAML 1.1 lenient flow indentation is rejected by the strict libfyaml backend' if libfyaml? @parser = Psych::Parser.new TreeBuilder.new @parser.parse(<<-eoyml) %YAML 1.1 diff --git a/test/psych/test_yaml.rb b/test/psych/test_yaml.rb index 134c346c..320920ce 100644 --- a/test/psych/test_yaml.rb +++ b/test/psych/test_yaml.rb @@ -461,6 +461,9 @@ def test_spec_type_float end def test_spec_type_misc + # The fixture relies on yes/no parsing as booleans, which is YAML 1.1 + # behavior the libfyaml backend does not follow. + omit 'yes/no are strings on the YAML 1.2 libfyaml backend' if libfyaml? assert_parse_only( { nil => nil, true => true, false => false, 'string' => '12345' }, < Date: Wed, 1 Jul 2026 09:59:18 +0900 Subject: [PATCH 08/16] Harden the libfyaml backend after code review Mark the reconstructed SyntaxError message UTF-8 instead of US-ASCII, so a diagnostic that embeds a multibyte snippet of the input does not raise Encoding::CompatibilityError when concatenated with UTF-8. Add RB_GC_GUARD for the anchor and tag strings in the emitter (matching the existing guard on the scalar value) so their C pointers cannot dangle if a GC runs inside fy_emit_event_create. 
Co-Authored-By: Claude Opus 4.8 --- ext/psych/psych_emitter_fy.c | 7 +++++++ ext/psych/psych_parser_fy.c | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ext/psych/psych_emitter_fy.c b/ext/psych/psych_emitter_fy.c index 7dd60f60..33d0c366 100644 --- a/ext/psych/psych_emitter_fy.c +++ b/ext/psych/psych_emitter_fy.c @@ -313,6 +313,8 @@ static VALUE scalar(VALUE self, VALUE value, VALUE anchor, VALUE tag, do_emit(e, event); RB_GC_GUARD(value); + RB_GC_GUARD(anchor); + RB_GC_GUARD(tag); return self; } @@ -333,6 +335,8 @@ static VALUE start_sequence(VALUE self, VALUE anchor, VALUE tag, NIL_P(tag) ? NULL : StringValueCStr(tag)); do_emit(e, event); + RB_GC_GUARD(anchor); + RB_GC_GUARD(tag); return self; } @@ -362,6 +366,8 @@ static VALUE start_mapping(VALUE self, VALUE anchor, VALUE tag, NIL_P(tag) ? NULL : StringValueCStr(tag)); do_emit(e, event); + RB_GC_GUARD(anchor); + RB_GC_GUARD(tag); return self; } @@ -383,6 +389,7 @@ static VALUE alias(VALUE self, VALUE anchor) do_emit(e, fy_emit_event_create(e->emit, FYET_ALIAS, NIL_P(anchor) ? NULL : StringValueCStr(anchor))); + RB_GC_GUARD(anchor); return self; } diff --git a/ext/psych/psych_parser_fy.c b/ext/psych/psych_parser_fy.c index 6b9d04e3..431705f1 100644 --- a/ext/psych/psych_parser_fy.c +++ b/ext/psych/psych_parser_fy.c @@ -112,7 +112,11 @@ static VALUE make_exception(psych_fy_parser_t *parser, VALUE path) void *iter = NULL; struct fy_diag_error *err = fy_diag_errors_iterate(diag, &iter); if (err) { - if (err->msg) problem = rb_usascii_str_new2(err->msg); + /* The message may embed a snippet of the (possibly multibyte) + * input, so mark it UTF-8 rather than US-ASCII. 
*/ + if (err->msg) { + problem = rb_enc_str_new_cstr(err->msg, rb_utf8_encoding()); + } if (err->line >= 0) line = (size_t)err->line; if (err->column >= 0) column = (size_t)err->column; } From 3c40dcaa00fbfd7ae76d5f4f21e3796e8cc9e55f Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 10:28:29 +0900 Subject: [PATCH 09/16] Add positive tests for the libfyaml backend Cover what the libfyaml backend does distinctly, each placed with the concern it belongs to: the Psych::BACKEND and Psych.libfyaml_version checks in test_psych.rb, the YAML 1.2 "Norway problem" boolean case in test_boolean.rb, and the "1.1 booleans are not quoted" emission case in test_string.rb. The 1.2 assertions are skipped on the default libyaml backend so the same suite passes under both. Co-Authored-By: Claude Opus 4.8 --- test/psych/test_boolean.rb | 10 ++++++++++ test/psych/test_psych.rb | 17 +++++++++++++++++ test/psych/test_string.rb | 8 ++++++++ 3 files changed, 35 insertions(+) diff --git a/test/psych/test_boolean.rb b/test/psych/test_boolean.rb index ec31c831..bf7227f5 100644 --- a/test/psych/test_boolean.rb +++ b/test/psych/test_boolean.rb @@ -48,5 +48,15 @@ def test_n assert_equal "n", Psych.load("--- n") assert_equal "N", Psych.load("--- N") end + + ### + # The "Norway problem": under YAML 1.2 (the libfyaml backend) yes/no/on/off + # are plain strings in every context, so the country code "no" no longer + # becomes false. + def test_norway_problem + omit 'libfyaml (YAML 1.2) backend only' unless libfyaml? 
+ assert_equal({ 'country' => 'no' }, Psych.load("country: no")) + assert_equal %w[yes no on off], Psych.load("- yes\n- no\n- on\n- off\n") + end end end diff --git a/test/psych/test_psych.rb b/test/psych/test_psych.rb index 6ba84e5e..4b02e844 100644 --- a/test/psych/test_psych.rb +++ b/test/psych/test_psych.rb @@ -118,6 +118,23 @@ def test_libyaml_version assert_equal Psych.libyaml_version.join('.'), Psych::LIBYAML_VERSION end + def test_backend + omit 'Psych::BACKEND is not defined on this backend' unless defined?(Psych::BACKEND) + assert_includes %w[libyaml libfyaml], Psych::BACKEND + assert_equal 'libfyaml', Psych::BACKEND if libfyaml? + end + + def test_libfyaml_version + omit 'libfyaml backend only' unless libfyaml? + assert_kind_of String, Psych.libfyaml_version + assert_match(/\A\d+\.\d+/, Psych.libfyaml_version) + end + + def test_libfyaml_version_absent_without_libfyaml + omit 'libfyaml backend defines libfyaml_version' if libfyaml? + refute_respond_to Psych, :libfyaml_version + end + def test_load_stream docs = Psych.load_stream("--- foo\n...\n--- bar\n...") assert_equal %w{ foo bar }, docs diff --git a/test/psych/test_string.rb b/test/psych/test_string.rb index b7abaafb..2b53844f 100644 --- a/test/psych/test_string.rb +++ b/test/psych/test_string.rb @@ -33,6 +33,14 @@ def test_all_yaml_1_1_booleans_are_quoted end end + def test_yaml_1_1_booleans_are_not_quoted_on_libfyaml + omit 'YAML 1.1 booleans are plain strings on the libfyaml backend' unless libfyaml? + %w[yes no on off].each do |boolean| + assert_equal "--- #{boolean}\n", Psych.dump(boolean) + assert_equal boolean, Psych.load(Psych.dump(boolean)) + end + end + def test_string_with_newline assert_equal "1\n2", Psych.load("--- ! 
'1\n\n 2'\n") end From 65fce575b61acda43a4d40164099b0239f21ee21 Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 10:34:58 +0900 Subject: [PATCH 10/16] Add CI for the experimental libfyaml backend Build psych with --enable-libfyaml and run the suite on Linux and macOS (libfyaml is not supported on Windows). A verification step asserts that Psych::BACKEND is actually libfyaml so the job cannot silently fall back to libyaml. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/libfyaml.yml | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .github/workflows/libfyaml.yml diff --git a/.github/workflows/libfyaml.yml b/.github/workflows/libfyaml.yml new file mode 100644 index 00000000..215731d5 --- /dev/null +++ b/.github/workflows/libfyaml.yml @@ -0,0 +1,44 @@ +name: libfyaml + +on: + push: + pull_request: + schedule: + - cron: '33 11 * * 0' + workflow_dispatch: + +# Exercises the experimental, opt-in libfyaml backend (--enable-libfyaml). +# libfyaml is not supported on Windows, so this workflow runs on Linux and +# macOS only. The default libyaml backend is covered by test.yml/libyaml.yml. 
+jobs: + ruby-versions: + uses: ruby/actions/.github/workflows/ruby_versions.yml@master + with: + engine: cruby + min_version: 3.0 + + build: + needs: ruby-versions + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + ruby: ${{ fromJson(needs.ruby-versions.outputs.versions) }} + os: [ ubuntu-latest, macos-latest ] + + steps: + - uses: actions/checkout@v7.0.0 + - name: Set up Ruby and libfyaml + uses: ruby/setup-ruby-pkgs@v1 + with: + ruby-version: ${{ matrix.ruby }} + apt-get: "libfyaml-dev pkg-config" + brew: "libfyaml pkg-config" + - name: Install dependencies + run: bundle install --jobs 3 + - name: Compile with the libfyaml backend + run: rake compile -- --enable-libfyaml + - name: Verify the libfyaml backend is active + run: ruby -Ilib -rpsych -e 'abort "expected the libfyaml backend, got #{Psych::BACKEND}" unless Psych::BACKEND == "libfyaml"; puts "libfyaml #{Psych.libfyaml_version}"' + - name: Run test + run: rake test From 021458127ea54453ad5dce00c28d3039f7f221a7 Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 10:40:38 +0900 Subject: [PATCH 11/16] Document the experimental libfyaml backend in the README Explain how to enable it (--enable-libfyaml), that it is unsupported on Windows, the YAML 1.2 boolean behavior it brings (the "Norway problem" fix), and how to check the active backend, with an experimental caveat. Co-Authored-By: Claude Opus 4.8 --- README.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/README.md b/README.md index 12681d9f..bd0dbac4 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ Psych.dump("foo") # => "--- foo\n...\n" ## Dependencies * libyaml +* libfyaml (optional, only for the experimental `--enable-libfyaml` backend) ## Installation @@ -57,6 +58,43 @@ gem 'psych' JRuby ships with a pure Java implementation of Psych. 
+## Experimental libfyaml backend + +Psych ships an experimental, opt-in backend built on +[libfyaml](https://github.com/pantoniou/libfyaml), a fully YAML 1.2 compliant +parser and emitter. It is compiled only when you explicitly pass +`--enable-libfyaml` at build time. Without the flag the default libyaml +backend is used and nothing changes. + +```bash +# libfyaml and pkg-config must be installed first, for example: +# apt-get install libfyaml-dev # Debian/Ubuntu +# brew install libfyaml # macOS +gem install psych -- --enable-libfyaml +``` + +This backend is not supported on Windows. + +Because libfyaml follows YAML 1.2, the YAML 1.1 booleans `yes`, `no`, `on`, and +`off` load as plain strings instead of `true`/`false` (only `true`/`false` are +booleans). This resolves the so-called "Norway problem", where the country +code `no` was parsed as `false`: + +```ruby +Psych.load("country: no") # => {"country" => "no"} +``` + +You can check which backend is active: + +```ruby +Psych::BACKEND # => "libfyaml" (or "libyaml") +Psych.libfyaml_version # => "0.9.6" +``` + +The backend is experimental. Its output is valid YAML but is formatted +differently from libyaml in places, and a few emitter edge cases are not yet +matched. The default libyaml backend remains the supported choice. + ## Release We used the trusted publisher and [rubygems/release-gem](https://github.com/rubygems/release-gem) workflow. From e56d9f7fcaa0f3728b7f984b9fe5c83659ebb79f Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 11:10:04 +0900 Subject: [PATCH 12/16] Fix the libfyaml CI verify step quoting The verify command was a plain YAML scalar, so the " #{Psych::BACKEND}" started a YAML comment and truncated the shell command, leaving an unterminated quote. Use a block scalar so the interpolation survives. 
Co-Authored-By: Claude Opus 4.8 --- .github/workflows/libfyaml.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/libfyaml.yml b/.github/workflows/libfyaml.yml index 215731d5..db3b0cee 100644 --- a/.github/workflows/libfyaml.yml +++ b/.github/workflows/libfyaml.yml @@ -39,6 +39,8 @@ jobs: - name: Compile with the libfyaml backend run: rake compile -- --enable-libfyaml - name: Verify the libfyaml backend is active - run: ruby -Ilib -rpsych -e 'abort "expected the libfyaml backend, got #{Psych::BACKEND}" unless Psych::BACKEND == "libfyaml"; puts "libfyaml #{Psych.libfyaml_version}"' + run: | + ruby -Ilib -rpsych -e 'abort "expected the libfyaml backend, got #{Psych::BACKEND}" unless Psych::BACKEND == "libfyaml"' + ruby -Ilib -rpsych -e 'puts "libfyaml #{Psych.libfyaml_version}"' - name: Run test run: rake test From 05ee44c4f7cd55045fcb585743869dab8ffba602 Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 11:20:50 +0900 Subject: [PATCH 13/16] Address Copilot review feedback on the libfyaml backend Reject canonical output with NotImplementedError instead of silently ignoring the request, and honor the implicit flag in start_sequence and start_mapping so an implicit tag is not printed as a redundant verbose tag. Relax the unquoted-boolean dump test to allow an optional document end marker. Also correct the version docstrings and a stale comment about how the parser collects diagnostics. 
Co-Authored-By: Claude Opus 4.8 --- ext/psych/psych.c | 8 ++++++-- ext/psych/psych_emitter_fy.c | 27 ++++++++++++++++++++++----- ext/psych/psych_parser_fy.c | 4 ++-- test/psych/test_emitter.rb | 8 ++++++++ test/psych/test_string.rb | 3 ++- 5 files changed, 40 insertions(+), 10 deletions(-) diff --git a/ext/psych/psych.c b/ext/psych/psych.c index 2a5db212..2e3f9678 100644 --- a/ext/psych/psych.c +++ b/ext/psych/psych.c @@ -2,7 +2,9 @@ /* call-seq: Psych.libyaml_version * - * Returns the version of libyaml being used + * Returns the version of the underlying YAML library as a three-element + * array. This is libyaml by default. On the experimental libfyaml backend, + * where libyaml is not linked, it reports the libfyaml version instead. */ static VALUE libyaml_version(VALUE module) { @@ -30,7 +32,9 @@ static VALUE libyaml_version(VALUE module) #ifdef PSYCH_USE_LIBFYAML /* call-seq: Psych.libfyaml_version * - * Returns the libfyaml version string, or nil when not built with libfyaml. + * Returns the libfyaml version string. This method is only defined when + * psych was built with the experimental libfyaml backend + * (+--enable-libfyaml+). */ static VALUE libfyaml_version(VALUE module) { diff --git a/ext/psych/psych_emitter_fy.c b/ext/psych/psych_emitter_fy.c index 33d0c366..1179a5db 100644 --- a/ext/psych/psych_emitter_fy.c +++ b/ext/psych/psych_emitter_fy.c @@ -122,7 +122,12 @@ static VALUE initialize(int argc, VALUE *argv, VALUE self) if (rb_scan_args(argc, argv, "11", &io, &options) == 2) { e->width = NUM2INT(rb_funcall(options, id_line_width, 0)); e->indent = NUM2INT(rb_funcall(options, id_indentation, 0)); - e->canonical = (Qtrue == rb_funcall(options, id_canonical, 0)) ? 1 : 0; + /* libfyaml has no canonical emit mode, so fail fast instead of + * silently producing non-canonical output. 
*/ + if (RTEST(rb_funcall(options, id_canonical, 0))) { + rb_raise(rb_eNotImpError, + "canonical output is not supported by the libfyaml backend"); + } } rb_ivar_set(self, id_io, io); @@ -329,10 +334,14 @@ static VALUE start_sequence(VALUE self, VALUE anchor, VALUE tag, if (!NIL_P(anchor)) { Check_Type(anchor, T_STRING); anchor = rb_str_export_to_enc(anchor, encoding); } if (!NIL_P(tag)) { Check_Type(tag, T_STRING); tag = rb_str_export_to_enc(tag, encoding); } + /* An implicit tag can be omitted, matching libyaml; emitting it anyway + * would print a redundant (often verbose) tag. */ + int emit_tag = !NIL_P(tag) && !RTEST(implicit); + struct fy_event *event = fy_emit_event_create(e->emit, FYET_SEQUENCE_START, psych_to_fyns(NUM2INT(style)), NIL_P(anchor) ? NULL : StringValueCStr(anchor), - NIL_P(tag) ? NULL : StringValueCStr(tag)); + emit_tag ? StringValueCStr(tag) : NULL); do_emit(e, event); RB_GC_GUARD(anchor); @@ -360,10 +369,14 @@ static VALUE start_mapping(VALUE self, VALUE anchor, VALUE tag, if (!NIL_P(anchor)) { Check_Type(anchor, T_STRING); anchor = rb_str_export_to_enc(anchor, encoding); } if (!NIL_P(tag)) { Check_Type(tag, T_STRING); tag = rb_str_export_to_enc(tag, encoding); } + /* An implicit tag can be omitted, matching libyaml; emitting it anyway + * would print a redundant (often verbose) tag. */ + int emit_tag = !NIL_P(tag) && !RTEST(implicit); + struct fy_event *event = fy_emit_event_create(e->emit, FYET_MAPPING_START, psych_to_fyns(NUM2INT(style)), NIL_P(anchor) ? NULL : StringValueCStr(anchor), - NIL_P(tag) ? NULL : StringValueCStr(tag)); + emit_tag ? StringValueCStr(tag) : NULL); do_emit(e, event); RB_GC_GUARD(anchor); @@ -397,8 +410,12 @@ static VALUE set_canonical(VALUE self, VALUE style) { psych_fy_emitter_t *e; TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); - e->canonical = (Qtrue == style) ? 
1 : 0; - rebuild_emitter(self, e); + /* libfyaml has no canonical emit mode, so reject enabling it rather than + * pretending to honor the request. */ + if (RTEST(style)) { + rb_raise(rb_eNotImpError, + "canonical output is not supported by the libfyaml backend"); + } return style; } diff --git a/ext/psych/psych_parser_fy.c b/ext/psych/psych_parser_fy.c index 431705f1..96aa0fe6 100644 --- a/ext/psych/psych_parser_fy.c +++ b/ext/psych/psych_parser_fy.c @@ -98,8 +98,8 @@ static VALUE allocate(VALUE klass) } /* Reconstruct a Psych::SyntaxError from libfyaml's collected diagnostics. The - * parser is created with FYPCF_COLLECT_DIAG, so the first collected error gives - * us the message and position. */ + * parser's diag was switched to collect mode with fy_diag_set_collect_errors() + * in parse(), so the first collected error gives us the message and position. */ static VALUE make_exception(psych_fy_parser_t *parser, VALUE path) { VALUE ePsychSyntaxError = rb_const_get(mPsych, rb_intern("SyntaxError")); diff --git a/test/psych/test_emitter.rb b/test/psych/test_emitter.rb index 506d7224..7755fec0 100644 --- a/test/psych/test_emitter.rb +++ b/test/psych/test_emitter.rb @@ -17,6 +17,14 @@ def test_line_width end def test_set_canonical + if libfyaml? + # The libfyaml backend has no canonical mode and rejects enabling it. + assert_raise(NotImplementedError) { @emitter.canonical = true } + @emitter.canonical = false + assert_equal false, @emitter.canonical + return + end + @emitter.canonical = true assert_equal true, @emitter.canonical diff --git a/test/psych/test_string.rb b/test/psych/test_string.rb index 2b53844f..1cb1ed03 100644 --- a/test/psych/test_string.rb +++ b/test/psych/test_string.rb @@ -36,7 +36,8 @@ def test_all_yaml_1_1_booleans_are_quoted def test_yaml_1_1_booleans_are_not_quoted_on_libfyaml omit 'YAML 1.1 booleans are plain strings on the libfyaml backend' unless libfyaml? 
%w[yes no on off].each do |boolean| - assert_equal "--- #{boolean}\n", Psych.dump(boolean) + # Unquoted plain scalar, allowing an optional document end marker. + assert_match(/\A--- #{boolean}\n(?:\.\.\.\n)?\z/, Psych.dump(boolean)) assert_equal boolean, Psych.load(Psych.dump(boolean)) end end From 6a10194a8508b2fb0392712f92a953c34330f1a8 Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 12:05:57 +0900 Subject: [PATCH 14/16] Require libfyaml 0.9 and build it from source in CI The libfyaml-dev package on Ubuntu is 0.8, which segfaults psych's emitter, while 0.9.6 (used on macOS via Homebrew) passes the whole suite. Build the same 0.9.6 release from source in the Linux CI job, and reject libfyaml older than 0.9 in extconf so users get a clear error instead of a runtime crash. Also drop an unused variable in set_canonical. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/libfyaml.yml | 16 +++++++++++++++- ext/psych/extconf.rb | 6 ++++++ ext/psych/psych_emitter_fy.c | 2 -- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.github/workflows/libfyaml.yml b/.github/workflows/libfyaml.yml index db3b0cee..761c6fa0 100644 --- a/.github/workflows/libfyaml.yml +++ b/.github/workflows/libfyaml.yml @@ -26,14 +26,28 @@ jobs: ruby: ${{ fromJson(needs.ruby-versions.outputs.versions) }} os: [ ubuntu-latest, macos-latest ] + env: + LIBFYAML_VERSION: "0.9.6" + steps: - uses: actions/checkout@v7.0.0 - name: Set up Ruby and libfyaml uses: ruby/setup-ruby-pkgs@v1 with: ruby-version: ${{ matrix.ruby }} - apt-get: "libfyaml-dev pkg-config" + apt-get: "pkg-config" brew: "libfyaml pkg-config" + - name: Build libfyaml from source + # The libfyaml-dev package on Ubuntu is 0.8, which crashes the emitter, + # so build a known-good release. macOS uses the Homebrew build above. 
+ if: runner.os == 'Linux' + run: | + curl -fsSL "https://github.com/pantoniou/libfyaml/releases/download/v${LIBFYAML_VERSION}/libfyaml-${LIBFYAML_VERSION}.tar.gz" | tar xz + cd "libfyaml-${LIBFYAML_VERSION}" + ./configure --prefix=/usr/local + make -j"$(nproc)" + sudo make install + sudo ldconfig - name: Install dependencies run: bundle install --jobs 3 - name: Compile with the libfyaml backend diff --git a/ext/psych/extconf.rb b/ext/psych/extconf.rb index e1ea7510..de7bd6c8 100644 --- a/ext/psych/extconf.rb +++ b/ext/psych/extconf.rb @@ -12,6 +12,12 @@ unless pkg_config('libfyaml') abort "libfyaml was requested with --enable-libfyaml but was not found via pkg-config" end + # libfyaml 0.8 and earlier crash psych's emitter, so require a known-good + # version rather than building something that segfaults at runtime. + pkgconfig = ENV["PKG_CONFIG"] || "pkg-config" + unless system(pkgconfig, "--atleast-version=0.9", "libfyaml") + abort "The libfyaml backend requires libfyaml 0.9 or newer" + end $defs << "-DPSYCH_USE_LIBFYAML" create_makefile 'psych' diff --git a/ext/psych/psych_emitter_fy.c b/ext/psych/psych_emitter_fy.c index 1179a5db..22bf23e5 100644 --- a/ext/psych/psych_emitter_fy.c +++ b/ext/psych/psych_emitter_fy.c @@ -408,8 +408,6 @@ static VALUE alias(VALUE self, VALUE anchor) static VALUE set_canonical(VALUE self, VALUE style) { - psych_fy_emitter_t *e; - TypedData_Get_Struct(self, psych_fy_emitter_t, &psych_emitter_type, e); /* libfyaml has no canonical emit mode, so reject enabling it rather than * pretending to honor the request. */ if (RTEST(style)) { From 9eb4a33b2b89762aa00e691f923f19c31fa3ad79 Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 14:07:42 +0900 Subject: [PATCH 15/16] Document libfyaml scalar style and SyntaxError differences An adversarial review confirmed two backend-visible differences that were not written down. 
Scalars emitted with the default style can be formatted differently from libyaml, and Psych::SyntaxError#context is always nil because libfyaml keeps the whole diagnostic in #problem. Co-Authored-By: Claude Opus 4.8 --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index bd0dbac4..78c8c6a9 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,13 @@ The backend is experimental. Its output is valid YAML but is formatted differently from libyaml in places, and a few emitter edge cases are not yet matched. The default libyaml backend remains the supported choice. +Two more differences are worth knowing. Scalars emitted with the default +(`ANY`) style may be quoted or laid out differently from libyaml, so +byte-for-byte output is not guaranteed to match. On a parse error, +`Psych::SyntaxError#problem` carries libfyaml's full diagnostic message and +`Psych::SyntaxError#context` is always `nil`, whereas libyaml splits the +description across `#problem` and `#context`. + ## Release We used the trusted publisher and [rubygems/release-gem](https://github.com/rubygems/release-gem) workflow. From d185ff2e618962e3fe44a37db93892a3d6e4f6bb Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Wed, 1 Jul 2026 14:23:33 +0900 Subject: [PATCH 16/16] Note libfyaml performance and when to prefer libyaml Record the rough parse/emit benchmark so users pick the backend on purpose: libfyaml is for YAML 1.2 semantics, and libyaml stays the choice when emit throughput matters since libfyaml dumps about 1.7x to 1.9x slower. Co-Authored-By: Claude Opus 4.8 --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 78c8c6a9..1b4df50e 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,14 @@ byte-for-byte output is not guaranteed to match. On a parse error, `Psych::SyntaxError#context` is always `nil`, whereas libyaml splits the description across `#problem` and `#context`. 
+This backend targets YAML 1.2 compliance, not speed. In a rough +single-machine benchmark that loads and dumps in-memory documents, parsing +was roughly on par with libyaml (sometimes faster on string-heavy input), +while emitting was about 1.7x to 1.9x slower. Your numbers will vary, but the +shape holds: libfyaml is competitive at parsing and slower at emitting. Use +this backend when you need YAML 1.2 semantics. If throughput is your priority, +keep using the default libyaml backend. + ## Release We used the trusted publisher and [rubygems/release-gem](https://github.com/rubygems/release-gem) workflow.