summaryrefslogtreecommitdiffhomepage
path: root/lib
diff options
context:
space:
mode:
authorJo-Philipp Wich <jo@mein.io>2023-08-21 16:33:30 +0200
committerJo-Philipp Wich <jo@mein.io>2023-08-23 14:24:46 +0200
commitc9982deea6ef7b729c4b802560c8e1656b9a910f (patch)
treeb54ac64b8e4c7a77526ad4a6b47fdba015ae7166 /lib
parent7dde4933be3408b6fbddd881681f298163a8318f (diff)
docs: add struct module documentation
Add full documentation coverage for the struct module by utilizing large parts of the Python struct module documentation for the format string description. Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Diffstat (limited to 'lib')
-rw-r--r--lib/struct.c451
1 files changed, 451 insertions, 0 deletions
diff --git a/lib/struct.c b/lib/struct.c
index cf4499a..ad0bc20 100644
--- a/lib/struct.c
+++ b/lib/struct.c
@@ -60,6 +60,306 @@
* - Removed unused code
*/
+/**
+ * # Handle Packed Binary Data
+ *
+ * The `struct` module provides routines for interpreting byte strings as packed
+ * binary data.
+ *
+ * Functions can be individually imported and directly accessed using the
+ * {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/import#named_import named import}
+ * syntax:
+ *
+ * ```
+ * import { pack, unpack } from 'struct';
+ *
+ * let buffer = pack('bhl', -13, 1234, 444555666);
+ * let values = unpack('bhl', buffer);
+ * ```
+ *
+ * Alternatively, the module namespace can be imported
+ * using a wildcard import statement:
+ *
+ * ```
+ * import * as struct from 'struct';
+ *
+ * let buffer = struct.pack('bhl', -13, 1234, 444555666);
+ * let values = struct.unpack('bhl', buffer);
+ * ```
+ *
+ * Additionally, the struct module namespace may also be imported by invoking
+ * the `ucode` interpreter with the `-lstruct` switch.
+ *
+ * ## Format Strings
+ *
+ * Format strings describe the data layout when packing and unpacking data.
+ * They are built up from format-characters, which specify the type of data
+ * being packed/unpacked. In addition, special characters control the byte
+ * order, size and alignment.
+ *
+ * Each format string consists of an optional prefix character which describes
+ * the overall properties of the data and one or more format characters which
+ * describe the actual data values and padding.
+ *
+ * ### Byte Order, Size, and Alignment
+ *
+ * By default, C types are represented in the machine's native format and byte
+ * order, and properly aligned by skipping pad bytes if necessary (according to
+ * the rules used by the C compiler).
+ *
+ * This behavior is chosen so that the bytes of a packed struct correspond
+ * exactly to the memory layout of the corresponding C struct.
+ *
+ * Whether to use native byte ordering and padding or standard formats depends
+ * on the application.
+ *
+ * Alternatively, the first character of the format string can be used to indicate
+ * the byte order, size and alignment of the packed data, according to the
+ * following table:
+ *
+ * | Character | Byte order | Size | Alignment |
+ * |-----------|------------------------|----------|-----------|
+ * | `@` | native | native | native |
+ * | `=` | native | standard | none |
+ * | `<` | little-endian | standard | none |
+ * | `>` | big-endian | standard | none |
+ * | `!` | network (= big-endian) | standard | none |
+ *
+ * If the first character is not one of these, `'@'` is assumed.
+ *
+ * Native byte order is big-endian or little-endian, depending on the
+ * host system. For example, Intel x86, AMD64 (x86-64), and Apple M1 are
+ * little-endian; IBM z and many legacy architectures are big-endian.
+ *
+ * Native size and alignment are determined using the C compiler's
+ * `sizeof` expression. This is always combined with native byte order.
+ *
+ * Standard size depends only on the format character; see the table in
+ * the "Format Characters" section below.
+ *
+ * Note the difference between `'@'` and `'='`: both use native byte order,
+ * but the size and alignment of the latter is standardized.
+ *
+ * The form `'!'` represents the network byte order which is always big-endian
+ * as defined in `IETF RFC 1700`.
+ *
+ * There is no way to indicate non-native byte order (force byte-swapping); use
+ * the appropriate choice of `'<'` or `'>'`.
+ *
+ * Notes:
+ *
+ * (1) Padding is only automatically added between successive structure members.
+ * No padding is added at the beginning or the end of the encoded struct.
+ *
+ * (2) No padding is added when using non-native size and alignment, e.g.
+ * with '<', '>', '=', and '!'.
+ *
+ * (3) To align the end of a structure to the alignment requirement of a
+ * particular type, end the format with the code for that type with a repeat
+ * count of zero.
+ *
+ *
+ * ### Format Characters
+ *
+ * Format characters have the following meaning; the conversion between C and
+ * ucode values should be obvious given their types. The 'Standard size' column
+ * refers to the size of the packed value in bytes when using standard size;
+ * that is, when the format string starts with one of `'<'`, `'>'`, `'!'` or
+ * `'='`. When using native size, the size of the packed value is platform
+ * dependent.
+ *
+ * | Format | C Type | Ucode type | Standard size | Notes |
+ * |--------|----------------------|------------|----------------|----------|
+ * | `x` | *pad byte* | *no value* | | (7) |
+ * | `c` | `char` | string | 1 | |
+ * | `b` | `signed char` | int | 1 | (1), (2) |
+ * | `B` | `unsigned char` | int | 1 | (2) |
+ * | `?` | `_Bool` | bool | 1 | (1) |
+ * | `h` | `short` | int | 2 | (2) |
+ * | `H` | `unsigned short` | int | 2 | (2) |
+ * | `i` | `int` | int | 4 | (2) |
+ * | `I` | `unsigned int` | int | 4 | (2) |
+ * | `l` | `long` | int | 4 | (2) |
+ * | `L` | `unsigned long` | int | 4 | (2) |
+ * | `q` | `long long` | int | 8 | (2) |
+ * | `Q` | `unsigned long long` | int | 8 | (2) |
+ * | `n` | `ssize_t` | int | | (3) |
+ * | `N` | `size_t` | int | | (3) |
+ * | `e` | (6) | double | 2 | (4) |
+ * | `f` | `float` | double | 4 | (4) |
+ * | `d` | `double` | double | 8 | (4) |
+ * | `s` | `char[]` | string | | (9) |
+ * | `p` | `char[]` | string | | (8) |
+ * | `P` | `void *` | int | | (5) |
+ * | `*` | `char[]` | string | | (10) |
+ *
+ * Notes:
+ *
+ * - (1) The `'?'` conversion code corresponds to the `_Bool` type defined by
+ * C99. If this type is not available, it is simulated using a `char`. In
+ * standard mode, it is always represented by one byte.
+ *
+ * - (2) When attempting to pack a non-integer using any of the integer
+ * conversion codes, this module attempts to convert the given value into an
+ * integer. If the value is not convertible, a type error exception is thrown.
+ *
+ * - (3) The `'n'` and `'N'` conversion codes are only available for the native
+ * size (selected as the default or with the `'@'` byte order character).
+ * For the standard size, you can use whichever of the other integer formats
+ * fits your application.
+ *
+ * - (4) For the `'f'`, `'d'` and `'e'` conversion codes, the packed
+ * representation uses the IEEE 754 binary32, binary64 or binary16 format
+ * (for `'f'`, `'d'` or `'e'` respectively), regardless of the floating-point
+ * format used by the platform.
+ *
+ * - (5) The `'P'` format character is only available for the native byte
+ * ordering (selected as the default or with the `'@'` byte order character).
+ * The byte order character `'='` chooses to use little- or big-endian
+ * ordering based on the host system. The struct module does not interpret
+ * this as native ordering, so the `'P'` format is not available.
+ *
+ * - (6) The IEEE 754 binary16 "half precision" type was introduced in the 2008
+ * revision of the `IEEE 754` standard. It has a sign bit, a 5-bit exponent
+ * and 11-bit precision (with 10 bits explicitly stored), and can represent
+ * numbers between approximately `6.1e-05` and `6.5e+04` at full precision.
+ * This type is not widely supported by C compilers: on a typical machine, an
+ * unsigned short can be used for storage, but not for math operations. See
+ * the Wikipedia page on the `half-precision floating-point format` for more
+ * information.
+ *
+ * - (7) When packing, `'x'` inserts one NUL byte.
+ *
+ * - (8) The `'p'` format character encodes a "Pascal string", meaning a short
+ * variable-length string stored in a *fixed number of bytes*, given by the
+ * count. The first byte stored is the length of the string, or 255,
+ * whichever is smaller. The bytes of the string follow. If the string
+ * passed in to `pack()` is too long (longer than the count minus 1), only
+ * the leading `count-1` bytes of the string are stored. If the string is
+ * shorter than `count-1`, it is padded with null bytes so that exactly count
+ * bytes in all are used. Note that for `unpack()`, the `'p'` format
+ * character consumes `count` bytes, but that the string returned can never
+ * contain more than 255 bytes.
+ *
+ * - (9) For the `'s'` format character, the count is interpreted as the length
+ * of the bytes, not a repeat count like for the other format characters; for
+ * example, `'10s'` means a single 10-byte string mapping to or from a single
+ * ucode byte string, while `'10c'` means 10 separate one byte character
+ * elements (e.g., `cccccccccc`) mapping to or from ten different ucode byte
+ * strings. If a count is not given, it defaults to 1. For packing, the
+ * string is truncated or padded with null bytes as appropriate to make it
+ * fit. For unpacking, the resulting string always has exactly the
+ * specified number of bytes. As a special case, `'0s'` means a single,
+ * empty string (while `'0c'` means 0 characters).
+ *
+ * - (10) The `*` format character serves as a wildcard. For `pack()` it will
+ * append the corresponding byte argument string as-is, not applying any
+ * padding or zero filling. When a repeat count is given, at most that many
+ * bytes of the input byte string argument will be appended on `pack()`,
+ * effectively truncating longer input strings. For `unpack()`, the wildcard
+ * format will yield a byte string containing the entire remaining input data
+ * bytes, or - when a repeat count is given - that many bytes of input data
+ * at most.
+ *
+ * A format character may be preceded by an integral repeat count. For example,
+ * the format string `'4h'` means exactly the same as `'hhhh'`.
+ *
+ * Whitespace characters between formats are ignored; a count and its format
+ * must not contain whitespace though.
+ *
+ * When packing a value `x` using one of the integer formats (`'b'`,
+ * `'B'`, `'h'`, `'H'`, `'i'`, `'I'`, `'l'`, `'L'`,
+ * `'q'`, `'Q'`), if `x` is outside the valid range for that format, a type
+ * error exception is raised.
+ *
+ * For the `'?'` format character, the return value is either `true` or `false`.
+ * When packing, the truthiness of the argument is used. Either 0 or 1
+ * in the native or standard bool representation will be packed, and any
+ * non-zero value will be `true` when unpacking.
+ *
+ * ## Examples
+ *
+ * Note:
+ * Native byte order examples (designated by the `'@'` format prefix or
+ * lack of any prefix character) may not match what the reader's
+ * machine produces as
+ * that depends on the platform and compiler.
+ *
+ * Pack and unpack integers of three different sizes, using big endian
+ * ordering:
+ *
+ * ```
+ * import { pack, unpack } from 'struct';
+ *
+ * pack(">bhl", 1, 2, 3); // "\x01\x00\x02\x00\x00\x00\x03"
+ * unpack(">bhl", "\x01\x00\x02\x00\x00\x00\x03"); // [ 1, 2, 3 ]
+ * ```
+ *
+ * Attempt to pack an integer which is too large for the defined field:
+ *
+ * ```bash
+ * $ ucode -lstruct -p 'struct.pack(">h", 99999)'
+ * Type error: Format 'h' requires numeric argument between -32768 and 32767
+ * In [-p argument], line 1, byte 24:
+ *
+ * `struct.pack(">h", 99999)`
+ * Near here -------------^
+ * ```
+ *
+ * Demonstrate the difference between `'s'` and `'c'` format characters:
+ *
+ * ```
+ * import { pack } from 'struct';
+ *
+ * pack("@ccc", "1", "2", "3"); // "123"
+ * pack("@3s", "123"); // "123"
+ * ```
+ *
+ * The ordering of format characters may have an impact on size in native
+ * mode since padding is implicit. In standard mode, the user is
+ * responsible for inserting any desired padding.
+ *
+ * Note in the first `pack()` call below that three NUL bytes were added after
+ * the packed `'#'` to align the following integer on a four-byte boundary.
+ * In this example, the output was produced on a little endian machine:
+ *
+ * ```
+ * import { pack } from 'struct';
+ *
+ * pack("@ci", "#", 0x12131415); // "#\x00\x00\x00\x15\x14\x13\x12"
+ * pack("@ic", 0x12131415, "#"); // "\x15\x14\x13\x12#"
+ * ```
+ *
+ * The following format `'ih0i'` results in two pad bytes being added at the
+ * end, assuming the platform's ints are aligned on 4-byte boundaries:
+ *
+ * ```
+ * import { pack } from 'struct';
+ *
+ * pack("ih0i", 0x01010101, 0x0202); // "\x01\x01\x01\x01\x02\x02\x00\x00"
+ * ```
+ *
+ * Use the wildcard format to extract the remainder of the input data:
+ *
+ * ```
+ * import { unpack } from 'struct';
+ *
+ * unpack("ccc*", "foobarbaz"); // [ "f", "o", "o", "barbaz" ]
+ * unpack("ccc3*", "foobarbaz"); // [ "f", "o", "o", "bar" ]
+ * ```
+ *
+ * Use the wildcard format to pack binary strings as-is into the result data:
+ *
+ * ```
+ * import { pack } from 'struct';
+ *
+ * pack("h*h", 0x0101, "\x02\x00\x03", 0x0404); // "\x01\x01\x02\x00\x03\x04\x04"
+ * pack("c3*c", "a", "foobar", "c"); // "afooc"
+ * ```
+ *
+ * @module struct
+ */
+
#include <ctype.h>
#include <errno.h>
#include <limits.h>
@@ -2397,6 +2697,33 @@ fail:
}
+/**
+ * Pack given values according to specified format.
+ *
+ * The `pack()` function creates a byte string containing the argument values
+ * packed according to the given format string.
+ *
+ * Returns the packed string.
+ *
+ * Raises a runtime exception if a given argument value does not match the
+ * required type of the corresponding format string directive or if an invalid
+ * format string is provided.
+ *
+ * @function module:struct#pack
+ *
+ * @param {string} format
+ * The format string.
+ *
+ * @param {...*} values
+ * Variable number of values to pack.
+ *
+ * @returns {string}
+ *
+ * @example
+ * // Pack the values 1, 2, 3 as three consecutive unsigned int values
+ * // in network byte order.
+ * const data = pack('!III', 1, 2, 3);
+ */
static uc_value_t *
uc_pack(uc_vm_t *vm, size_t nargs)
{
@@ -2416,6 +2743,39 @@ uc_pack(uc_vm_t *vm, size_t nargs)
return res;
}
+/**
+ * Unpack given byte string according to specified format.
+ *
+ * The `unpack()` function interprets a byte string according to the given
+ * format string and returns the resulting values. If the optional offset
+ * argument is given, unpacking starts from this byte position within the input.
+ * If not specified, the start offset defaults to `0`, the start of the given
+ * input string.
+ *
+ * Returns an array of unpacked values.
+ *
+ * Raises a runtime exception if the format string is invalid or if an invalid
+ * input string or offset value is given.
+ *
+ * @function module:struct#unpack
+ *
+ * @param {string} format
+ * The format string.
+ *
+ * @param {string} input
+ * The input string to unpack.
+ *
+ * @param {number} [offset=0]
+ * The offset within the input string to start unpacking from.
+ *
+ * @returns {array}
+ *
+ * @example
+ * // Unpack three consecutive unsigned int values in network byte order.
+ * const numbers =
+ * unpack('!III', '\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03');
+ * print(numbers, "\n"); // [ 1, 2, 3 ]
+ */
static uc_value_t *
uc_unpack(uc_vm_t *vm, size_t nargs)
{
@@ -2436,6 +2796,47 @@ uc_unpack(uc_vm_t *vm, size_t nargs)
}
+/**
+ * Represents a struct instance created by `new()`.
+ *
+ * @class module:struct.instance
+ * @hideconstructor
+ *
+ * @see {@link module:struct#new|new()}
+ *
+ * @example
+ *
+ * const fmt = struct.new(…);
+ *
+ * fmt.pack(…);
+ *
+ * const values = fmt.unpack(…);
+ */
+
+/**
+ * Precompile format string.
+ *
+ * The `new()` function precompiles the given format string argument and returns
+ * a `struct` object instance useful for packing and unpacking multiple items
+ * without having to recompute the internal format each time.
+ *
+ * Returns a precompiled struct format instance.
+ *
+ * Raises a runtime exception if the format string is invalid.
+ *
+ * @function module:struct#new
+ *
+ * @param {string} format
+ * The format string.
+ *
+ * @returns {module:struct.instance}
+ *
+ * @example
+ * // Create a format of three consecutive unsigned int values in network byte order.
+ * const fmt = struct.new('!III');
+ * const buf = fmt.pack(1, 2, 3); // "\x00\x00\x00\x01…"
+ * print(fmt.unpack(buf), "\n"); // [ 1, 2, 3 ]
+ */
static uc_value_t *
uc_struct_new(uc_vm_t *vm, size_t nargs)
{
@@ -2458,6 +2859,28 @@ uc_struct_gc(void *ud)
free(state);
}
+/**
+ * Pack given values.
+ *
+ * The `pack()` function creates a byte string containing the argument values
+ * packed according to the given format instance.
+ *
+ * Returns the packed string.
+ *
+ * Raises a runtime exception if a given argument value does not match the
+ * required type of the corresponding format string directive.
+ *
+ * @function module:struct.instance#pack
+ *
+ * @param {...*} values
+ * Variable number of values to pack.
+ *
+ * @returns {string}
+ *
+ * @example
+ * const fmt = struct.new(…);
+ * const data = fmt.pack(…);
+ */
static uc_value_t *
uc_struct_pack(uc_vm_t *vm, size_t nargs)
{
@@ -2469,6 +2892,34 @@ uc_struct_pack(uc_vm_t *vm, size_t nargs)
return uc_pack_common(vm, nargs, *state, 0);
}
+/**
+ * Unpack given byte string.
+ *
+ * The `unpack()` function interprets a byte string according to the given
+ * format instance and returns the resulting values. If the optional offset
+ * argument is given, unpacking starts from this byte position within the input.
+ * If not specified, the start offset defaults to `0`, the start of the given
+ * input string.
+ *
+ * Returns an array of unpacked values.
+ *
+ * Raises a runtime exception if an invalid input string or offset value is
+ * given.
+ *
+ * @function module:struct.instance#unpack
+ *
+ * @param {string} input
+ * The input string to unpack.
+ *
+ * @param {number} [offset=0]
+ * The offset within the input string to start unpacking from.
+ *
+ * @returns {array}
+ *
+ * @example
+ * const fmt = struct.new(…);
+ * const values = fmt.unpack(…);
+ */
static uc_value_t *
uc_struct_unpack(uc_vm_t *vm, size_t nargs)
{