Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 3 additions & 12 deletions docs/serialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ Every field on the node is then appended to the serialized string. The fields ca
* `node` - A field that is a node. This is structured just like the parent node.
* `node?` - A field that is a node that is optionally present. If the node is not present, then a single `0` byte will be written in its place. If it is present, then it will be structured just like the parent node.
* `node[]` - A field that is an array of nodes. This is structured as a variable-length integer length, followed by the child nodes themselves.
* `string` - A field that is a string. For example, this is used as the name of the method in a call node, since it cannot directly reference the source string (as in `@-` or `foo=`). This is structured as a variable-length integer byte length, followed by the string itself (_without_ a trailing null byte).
* `string` - A field that is a string. For example, this is used as the name of the method in a call node, since it cannot directly reference the source string (as in `@-` or `foo=`). This is structured as a variable-length integer byte length, followed by the string bytes (_without_ a trailing null byte).
* `constant` - A variable-length integer that represents an index in the constant pool.
* `constant?` - An optional variable-length integer that represents an index in the constant pool. If it's not present, then a single `0` byte will be written in its place.
* `integer` - A field that represents an arbitrary-sized integer. The structure is listed above.
Expand All @@ -135,23 +135,14 @@ Every field on the node is then appended to the serialized string. The fields ca
* `uint8` - A field that is an 8-bit unsigned integer. This is structured as a single byte.
* `uint32` - A field that is a 32-bit unsigned integer. This is structured as a variable-length integer.

After the syntax tree, the content pool is serialized. This is a list of constants that were referenced from within the tree. The content pool begins at the offset specified in the header. Constants can be either "owned" (in which case their contents are embedded in the serialization) or "shared" (in which case their contents represent a slice of the source string). The most significant bit of the constant indicates whether it is owned or shared.

In the case that it is owned, the constant is structured as follows:
After the syntax tree, the constant pool is serialized. This is a list of constants that were referenced from within the tree. The constant pool begins at the offset specified in the header. Every constant is embedded in the serialization. Each constant is structured as follows:

| # bytes | field |
| --- | --- |
| `4` | the byte offset in the serialization for the contents of the constant |
| `4` | the byte length in the serialization |

Note that you will need to mask off the most significant bit for the byte offset in the serialization. In the case that it is shared, the constant is structured as follows:

| # bytes | field |
| --- | --- |
| `4` | the byte offset in the source string for the contents of the constant |
| `4` | the byte length in the source string |

After the constant pool, the contents of the owned constants are serialized. This is just a sequence of bytes that represent the contents of the constants. At the end of the serialization, the buffer is null terminated.
After the constant pool, the contents of the constants are serialized. This is just a sequence of bytes that represent the contents of the constants. At the end of the serialization, the buffer is null terminated.

## APIs

Expand Down
2 changes: 1 addition & 1 deletion javascript/src/parsePrism.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ export function parsePrism(prism, source, options = {}) {

prism.pm_serialize_parse(bufferPointer, sourcePointer, sourceArray.length, optionsPointer);
const serializedView = new Uint8Array(prism.memory.buffer, prism.pm_buffer_value(bufferPointer), prism.pm_buffer_length(bufferPointer));
const result = deserialize(sourceArray, serializedView);
const result = deserialize(serializedView);

prism.pm_buffer_free(bufferPointer);
prism.free(sourcePointer);
Expand Down
40 changes: 9 additions & 31 deletions templates/java/org/ruby_lang/prism/Loader.java.erb
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,11 @@ public class Loader {
private static final class ConstantPool {

private final Loader loader;
private final byte[] source;
private final int bufferOffset;
private final <%= string_type %>[] cache;

ConstantPool(Loader loader, byte[] source, int bufferOffset, int length) {
ConstantPool(Loader loader, int bufferOffset, int length) {
this.loader = loader;
this.source = source;
this.bufferOffset = bufferOffset;
cache = new <%= string_type %>[length];
}
Expand All @@ -61,15 +59,10 @@ public class Loader {
int length = buffer.getInt(offset + 4);

byte[] bytes = new byte[length];

if (Integer.compareUnsigned(start, 0x7FFFFFFF) <= 0) {
System.arraycopy(source, start, bytes, 0, length);
} else {
int position = buffer.position();
buffer.position(start & 0x7FFFFFFF);
buffer.get(bytes, 0, length);
buffer.position(position);
}
int position = buffer.position();
buffer.position(start);
buffer.get(bytes, 0, length);
buffer.position(position);

constant = loader.bytesToName(bytes);
cache[index] = constant;
Expand Down Expand Up @@ -125,7 +118,7 @@ public class Loader {

int constantPoolBufferOffset = buffer.getInt();
int constantPoolLength = loadVarUInt();
this.constantPool = new ConstantPool(this, source.bytes, constantPoolBufferOffset, constantPoolLength);
this.constantPool = new ConstantPool(this, constantPoolBufferOffset, constantPoolLength);

Nodes.Node node;
if (errors.length == 0) {
Expand All @@ -146,28 +139,13 @@ public class Loader {
return new ParseResult(node, magicComments, dataLocation, errors, warnings, source);
}

private byte[] loadEmbeddedString() {
private byte[] loadString() {
int length = loadVarUInt();
byte[] bytes = new byte[length];
buffer.get(bytes);
return bytes;
}

private byte[] loadString() {
switch (buffer.get()) {
case 1:
int start = loadVarUInt();
int length = loadVarUInt();
byte[] bytes = new byte[length];
System.arraycopy(source.bytes, start, bytes, 0, length);
return bytes;
case 2:
return loadEmbeddedString();
default:
throw new Error("Expected 0 or 1 but was " + buffer.get());
}
}

private int[] loadLineOffsets() {
int count = loadVarUInt();
int[] lineOffsets = new int[count];
Expand Down Expand Up @@ -199,7 +177,7 @@ public class Loader {
// error messages only contain ASCII characters
for (int i = 0; i < count; i++) {
Nodes.ErrorType type = Nodes.ERROR_TYPES[loadVarUInt()];
byte[] bytes = loadEmbeddedString();
byte[] bytes = loadString();
String message = new String(bytes, StandardCharsets.US_ASCII);
Nodes.Location location = loadLocation();
ParseResult.ErrorLevel level = ParseResult.ERROR_LEVELS[buffer.get()];
Expand All @@ -218,7 +196,7 @@ public class Loader {
// warning messages only contain ASCII characters
for (int i = 0; i < count; i++) {
Nodes.WarningType type = Nodes.WARNING_TYPES[loadVarUInt() - <%= errors.length %>];
byte[] bytes = loadEmbeddedString();
byte[] bytes = loadString();
String message = new String(bytes, StandardCharsets.US_ASCII);
Nodes.Location location = loadLocation();
ParseResult.WarningLevel level = ParseResult.WARNING_LEVELS[buffer.get()];
Expand Down
31 changes: 6 additions & 25 deletions templates/javascript/src/deserialize.js.erb
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ class SerializationBuffer {
["ascii-8bit", "ascii"]
]);

constructor(source, array) {
this.source = source;
constructor(array) {
this.array = array;
this.index = 0;
this.fileEncoding = "utf-8";
Expand Down Expand Up @@ -96,32 +95,15 @@ class SerializationBuffer {

readStringField(flags) {
if (flags === undefined) flags = 0;
const type = this.readByte();

switch (type) {
case 1: {
const startOffset = this.readVarInt();
const length = this.readVarInt();
return this.decodeString(this.source.slice(startOffset, startOffset + length), flags);
}
case 2:
return this.decodeString(this.readBytes(this.readVarInt()), flags);
default:
throw new Error(`Unknown serialized string type: ${type}`);
}
return this.decodeString(this.readBytes(this.readVarInt()), flags);
}

scanConstant(constantPoolOffset, constantIndex) {
const offset = constantPoolOffset + constantIndex * 8;
let startOffset = this.scanUint32(offset);
const startOffset = this.scanUint32(offset);
const length = this.scanUint32(offset + 4);

if (startOffset & (1 << 31)) {
startOffset &= (1 << 31) - 1;
return new TextDecoder().decode(this.array.slice(startOffset, startOffset + length));
} else {
return new TextDecoder().decode(this.source.slice(startOffset, startOffset + length));
}
return this.getDecoder(this.fileEncoding).decode(this.array.slice(startOffset, startOffset + length));
}

readDouble() {
Expand Down Expand Up @@ -293,13 +275,12 @@ const warningTypes = [
* Accept a Uint8Array containing the serialized format.
* Return the AST corresponding to the serialized form.
*
* @param {Uint8Array} source
* @param {Uint8Array} array
* @returns {ParseResult}
* @throws {Error}
*/
export function deserialize(source, array) {
const buffer = new SerializationBuffer(source, array);
export function deserialize(array) {
const buffer = new SerializationBuffer(array);

if (buffer.readString(5) !== "PRISM") {
throw new Error("Invalid serialization");
Expand Down
38 changes: 10 additions & 28 deletions templates/lib/prism/serialize.rb.erb
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ module Prism
cpool_base = loader.load_uint32
cpool_size = loader.load_varuint

constant_pool = ConstantPool.new(input, serialized, cpool_base, cpool_size)
constant_pool = ConstantPool.new(serialized, cpool_base, cpool_size)

node = loader.load_node(constant_pool, encoding, freeze) #: ProgramNode
loader.load_constant_pool(constant_pool)
Expand Down Expand Up @@ -171,7 +171,7 @@ module Prism
cpool_base = loader.load_uint32
cpool_size = loader.load_varuint

constant_pool = ConstantPool.new(input, serialized, cpool_base, cpool_size)
constant_pool = ConstantPool.new(serialized, cpool_base, cpool_size)

node = loader.load_node(constant_pool, encoding, freeze) #: ProgramNode
loader.load_constant_pool(constant_pool)
Expand Down Expand Up @@ -202,14 +202,12 @@ module Prism
class ConstantPool # :nodoc:
attr_reader :size #: Integer

# @rbs @input: String
# @rbs @serialized: String
# @rbs @base: Integer
# @rbs @pool: Array[Symbol?]

#: (String input, String serialized, Integer base, Integer size) -> void
def initialize(input, serialized, base, size)
@input = input
#: (String serialized, Integer base, Integer size) -> void
def initialize(serialized, base, size)
@serialized = serialized
@base = base
@size = size
Expand All @@ -224,11 +222,7 @@ module Prism
start = @serialized.unpack1("L", offset: offset) #: Integer
length = @serialized.unpack1("L", offset: offset + 4) #: Integer

if start.nobits?(1 << 31)
(@input.byteslice(start, length) or raise).force_encoding(encoding).to_sym
else
(@serialized.byteslice(start & ((1 << 31) - 1), length) or raise).force_encoding(encoding).to_sym
end
(@serialized.byteslice(start, length) or raise).force_encoding(encoding).to_sym
end
end
end
Expand Down Expand Up @@ -289,8 +283,8 @@ module Prism
trailer = 0

constant_pool.size.times do |index|
start, length = (io.read(8) or raise).unpack("L2") #: [Integer, Integer]
trailer += length if start.anybits?(1 << 31)
length = (io.read(8) or raise).unpack1("L", offset: 4) #: Integer
trailer += length
end

io.read(trailer)
Expand Down Expand Up @@ -388,7 +382,7 @@ module Prism
error =
ParseError.new(
DIAGNOSTIC_TYPES.fetch(load_varuint),
load_embedded_string(encoding),
load_string(encoding),
load_location_object(freeze),
load_error_level
)
Expand Down Expand Up @@ -422,7 +416,7 @@ module Prism
warning =
ParseWarning.new(
DIAGNOSTIC_TYPES.fetch(load_varuint),
load_embedded_string(encoding),
load_string(encoding),
load_location_object(freeze),
load_warning_level
)
Expand Down Expand Up @@ -507,21 +501,9 @@ module Prism
end
end

#: (Encoding encoding) -> String
def load_embedded_string(encoding)
(io.read(load_varuint) or raise).force_encoding(encoding).freeze
end

#: (Encoding encoding) -> String
def load_string(encoding)
case (type = io.getbyte)
when 1
(input.byteslice(load_varuint, load_varuint) or raise).force_encoding(encoding).freeze
when 2
load_embedded_string(encoding)
else
raise "Unknown serialized string type: #{type}"
end
(io.read(load_varuint) or raise).force_encoding(encoding).freeze
end

#: (bool freeze) -> Location
Expand Down
54 changes: 10 additions & 44 deletions templates/src/serialize.c.erb
Original file line number Diff line number Diff line change
Expand Up @@ -26,28 +26,10 @@ pm_serialize_location(const pm_location_t *location, pm_buffer_t *buffer) {
}

static void
pm_serialize_string(const pm_parser_t *parser, const pm_string_t *string, pm_buffer_t *buffer) {
switch (string->type) {
case PM_STRING_SHARED: {
pm_buffer_append_byte(buffer, 1);
pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(pm_string_source(string) - parser->start));
pm_buffer_append_varuint(buffer, pm_sizet_to_u32(pm_string_length(string)));
break;
}
case PM_STRING_OWNED:
case PM_STRING_CONSTANT: {
uint32_t length = pm_sizet_to_u32(pm_string_length(string));
pm_buffer_append_byte(buffer, 2);
pm_buffer_append_varuint(buffer, length);
pm_buffer_append_bytes(buffer, pm_string_source(string), length);
break;
}
#ifdef PRISM_HAS_MMAP
case PM_STRING_MAPPED:
assert(false && "Cannot serialize mapped strings.");
break;
#endif
}
pm_serialize_string(const pm_string_t *string, pm_buffer_t *buffer) {
uint32_t length = pm_sizet_to_u32(pm_string_length(string));
pm_buffer_append_varuint(buffer, length);
pm_buffer_append_bytes(buffer, pm_string_source(string), length);
}

static void
Expand Down Expand Up @@ -102,7 +84,7 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
pm_serialize_node(parser, (pm_node_t *)((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer);
}
<%- when Prism::Template::StringField -%>
pm_serialize_string(parser, &((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer);
pm_serialize_string(&((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer);
<%- when Prism::Template::NodeListField -%>
uint32_t <%= field.name %>_size = pm_sizet_to_u32(((pm_<%= node.human %>_t *)node)-><%= field.name %>.size);
pm_buffer_append_varuint(buffer, <%= field.name %>_size);
Expand Down Expand Up @@ -304,28 +286,12 @@ pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer)
pm_constant_t *constant = &parser->constant_pool.constants[bucket->id - 1];
size_t buffer_offset = offset + ((((size_t)bucket->id) - 1) * 8);

if (bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED || bucket->type == PM_CONSTANT_POOL_BUCKET_CONSTANT) {
// Since this is an owned or constant constant, we are going to
// write its contents into the buffer after the constant pool.
// So effectively in place of the source offset, we have a
// buffer offset. We will add a leading 1 to indicate that this
// is a buffer offset.
uint32_t content_offset = pm_sizet_to_u32(buffer->length);
uint32_t owned_mask = 1U << 31;

assert(content_offset < owned_mask);
content_offset |= owned_mask;

memcpy(buffer->value + buffer_offset, &content_offset, 4);
pm_buffer_append_bytes(buffer, constant->start, constant->length);
} else {
// Since this is a shared constant, we are going to write its
// source offset directly into the buffer.
uint32_t source_offset = pm_ptrdifft_to_u32(constant->start - parser->start);
memcpy(buffer->value + buffer_offset, &source_offset, 4);
}
// Write the constant contents into the buffer after the constant
// pool. In place of the source offset, we store a buffer offset.
uint32_t content_offset = pm_sizet_to_u32(buffer->length);
memcpy(buffer->value + buffer_offset, &content_offset, 4);
pm_buffer_append_bytes(buffer, constant->start, constant->length);

// Now we can write the length of the constant into the buffer.
uint32_t constant_length = pm_sizet_to_u32(constant->length);
memcpy(buffer->value + buffer_offset + 4, &constant_length, 4);
}
Expand Down
Loading