From 10b9d69870d167c0b7185824d0183fd314a3bb1f Mon Sep 17 00:00:00 2001 From: Trung Date: Thu, 2 Jul 2026 15:00:54 +0700 Subject: [PATCH] Direct shebang execution Resolve shebang scripts in the standalone loader's entry point with a bounded loop. Each pass reads the script's #! line, including its single optional argument as in #!/bin/sh -x, points elf_path at the interpreter, prepends the interpreter and that argument to guest_argv, and resolves the interpreter's host path again, for at most five nested interpreter levels. The line parser lives in elf_read_shebang() so that sys_execve shares it, and a host unit test covers it. --- Makefile | 7 ++ mk/tests.mk | 7 ++ src/core/elf.c | 86 ++++++++++++++++++++++ src/core/elf.h | 20 +++++ src/main.c | 123 +++++++++++++++++++++++++++++-- src/syscall/exec.c | 91 ++++++++--------------- tests/test-shebang-host.c | 151 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 416 insertions(+), 69 deletions(-) create mode 100644 tests/test-shebang-host.c diff --git a/Makefile b/Makefile index 5b77680..455a965 100644 --- a/Makefile +++ b/Makefile @@ -167,6 +167,13 @@ $(BUILD_DIR)/test-proctitle-host: $(BUILD_DIR)/test-proctitle-host.o \ @echo " LD $@" $(Q)$(CC) $(CFLAGS) -o $@ $^ +## Build the shebang parsing host test (native macOS binary) +$(BUILD_DIR)/test-shebang-host: $(BUILD_DIR)/test-shebang-host.o \ + $(BUILD_DIR)/core/elf.o | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + + # Guest test binaries (cross-compiled, aarch64-linux) # Only used when GUEST_TEST_BINARIES is not set. 
diff --git a/mk/tests.mk b/mk/tests.mk index b4fa1c4..5a7042b 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -47,6 +47,8 @@ check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage \ @$(BUILD_DIR)/test-fork-ipc-protocol-host @printf "\n$(BLUE)━━━ identity override unit test ━━━$(RESET)\n" @$(BUILD_DIR)/test-identity-override-host + @printf "\n$(BLUE)━━━ shebang parser unit test ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-shebang-host @printf "\n$(BLUE)━━━ proctitle argv-tail regression ━━━$(RESET)\n" @$(MAKE) --no-print-directory test-proctitle-host @printf "\n$(BLUE)━━━ proctitle low-stack regression ━━━$(RESET)\n" @@ -584,3 +586,8 @@ test-fork-ipc-protocol-host: $(BUILD_DIR)/test-fork-ipc-protocol-host ## Run the deterministic argv-tail overshoot guard test test-proctitle-host: $(BUILD_DIR)/test-proctitle-host $(BUILD_DIR)/test-proctitle-host + +# Shebang parser unit test +## Run shebang parsing unit tests +test-shebang-host: $(BUILD_DIR)/test-shebang-host + $(BUILD_DIR)/test-shebang-host diff --git a/src/core/elf.c b/src/core/elf.c index 575e1c7..e40d56e 100644 --- a/src/core/elf.c +++ b/src/core/elf.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "core/elf.h" #include "debug/log.h" @@ -427,3 +428,88 @@ void elf_resolve_interp(const char *sysroot, /* Strategy 3: use interp_path as-is */ str_copy_trunc(out, interp_path, out_sz); } + +int elf_read_shebang(const char *host_path, + char *interp_out, + size_t interp_sz, + char *arg_out, + size_t arg_sz) +{ + int fd = open(host_path, O_RDONLY); + if (fd < 0) + return -errno; + + char buf[512]; + ssize_t nread = read(fd, buf, sizeof(buf) - 1); + close_keep_errno(fd); + + if (nread < 0) { + return -errno; + } + if (nread < 2 || buf[0] != '#' || buf[1] != '!') { + return 0; /* Not a shebang script */ + } + buf[nread] = '\0'; + + /* Ignore script bytes after the first line (find \n or \r). If the shebang + * line is longer than our 511-byte buffer (no EOL found but buffer is + * full), reject it. 
+ */ + char *eol = strpbrk(buf + 2, "\r\n"); + if (!eol) { + if (nread == (ssize_t) (sizeof(buf) - 1)) { + return -ENOEXEC; /* Shebang line too long */ + } + } else { + *eol = '\0'; + } + + char *ptr = buf + 2; + while (*ptr == ' ' || *ptr == '\t') { + ptr++; + } + + /* Strip trailing whitespace/newlines of the whole shebang line */ + size_t len = strlen(ptr); + while (len > 0 && (ptr[len - 1] == ' ' || ptr[len - 1] == '\t' || + ptr[len - 1] == '\r' || ptr[len - 1] == '\n')) { + ptr[--len] = '\0'; + } + + if (len == 0) { + return -ENOEXEC; /* Empty shebang interpreter */ + } + + /* Split at the first blank: interpreter, then the rest as one argument */ + char *interp = ptr; + char *space = strpbrk(ptr, " \t"); + char *arg = NULL; + if (space) { + *space = '\0'; + arg = space + 1; + /* Strip leading space of the argument */ + while (*arg == ' ' || *arg == '\t') { + arg++; + } + /* Defensive re-trim; the whole line was already right-trimmed above */ + size_t arg_len = strlen(arg); + while (arg_len > 0 && + (arg[arg_len - 1] == ' ' || arg[arg_len - 1] == '\t' || + arg[arg_len - 1] == '\r' || arg[arg_len - 1] == '\n')) { + arg[--arg_len] = '\0'; + } + if (strlen(arg) == 0) { + arg = NULL; + } + } + + if (str_copy_trunc(interp_out, interp, interp_sz) >= interp_sz) { + return -ENOEXEC; /* Buffer too small */ + } + + if (str_copy_trunc(arg_out, arg ? arg : "", arg_sz) >= arg_sz) { + return -ENOEXEC; /* Buffer too small */ + } + + return 1; /* Successfully parsed shebang */ +} diff --git a/src/core/elf.h b/src/core/elf.h index a8ce7ce..0695cdb 100644 --- a/src/core/elf.h +++ b/src/core/elf.h @@ -136,6 +136,26 @@ void elf_resolve_interp(const char *sysroot, char *out, size_t out_sz); +/* Read, probe, and parse a shebang script header from host_path. + * Writes interpreter path to interp_out and the trimmed rest of the line, + * kept as one argument with inner blanks intact, to arg_out. arg_out is set + * to an empty string if there is no optional argument. + * + * Supports LF (\n), CRLF (\r\n), and CR (\r) line endings. 
If the shebang + * line is not terminated within the 511-byte buffer limit, returns -ENOEXEC. + * + * Returns: + * 1 if a shebang script was successfully parsed + * 0 if the file is not a shebang script + * Negative errno on failure (e.g. -ENOENT; -ENOEXEC for an empty or + * over-long line, or when an output buffer is too small) + */ +int elf_read_shebang(const char *host_path, + char *interp_out, + size_t interp_sz, + char *arg_out, + size_t arg_sz); + /* Translate ELF program-header flags (PF_R=4, PF_W=2, PF_X=1) into the * R=1/W=2/X=4 bitset shared by both MEM_PERM_R/W/X (page-table permissions) and * LINUX_PROT_READ/WRITE/EXEC (mmap prot bits). diff --git a/src/main.c b/src/main.c index b36b4f1..159cbea 100644 --- a/src/main.c +++ b/src/main.c @@ -453,16 +453,125 @@ int main(int argc, char **argv) } proc_set_sysroot(sysroot); - if (resolve_guest_elf_host_path(elf_path, elf_host_path, - sizeof(elf_host_path), - &elf_host_temp) < 0) { - log_error("failed to resolve ELF path %s: %s", elf_path, - strerror(errno)); + + int shebang_depth = 0; + const int max_shebang_depth = 5; + /* Probe up to max+1 files: max script levels plus the final binary */ + while (shebang_depth <= max_shebang_depth) { + if (resolve_guest_elf_host_path(elf_path, elf_host_path, + sizeof(elf_host_path), + &elf_host_temp) < 0) { + log_error("failed to resolve ELF path %s: %s", elf_path, + strerror(errno)); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); + if (elf_host_temp) + unlink(elf_host_path); + return 1; + } + + /* Check if the file starts with "#!" */ + char interp[LINUX_PATH_MAX]; + char arg[LINUX_PATH_MAX]; + int rc = elf_read_shebang(elf_host_path, interp, sizeof(interp), arg, + sizeof(arg)); + if (rc == 0) { + /* Not a shebang script, proceed to boot */ + break; + } + + if (rc < 0) { + log_error("invalid shebang in %s: %s", elf_path, strerror(-rc)); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? 
host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); + if (elf_host_temp) + unlink(elf_host_path); + return 1; + } + + shebang_depth++; + + /* Prepend interpreter (and argument if present) to guest_argv */ + bool has_arg = (arg[0] != '\0'); + int add_count = has_arg ? 2 : 1; + int new_argc = guest_argc + add_count; + const char **new_argv = + (const char **) calloc((size_t) new_argc, sizeof(char *)); + if (!new_argv) { + log_error("out of memory"); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); + if (elf_host_temp) + unlink(elf_host_path); + return 1; + } + + new_argv[0] = strdup(interp); + if (!new_argv[0]) { + log_error("out of memory"); + free((void *) new_argv); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); + if (elf_host_temp) + unlink(elf_host_path); + return 1; + } + if (has_arg) { + new_argv[1] = strdup(arg); + if (!new_argv[1]) { + log_error("out of memory"); + free((void *) new_argv[0]); + free((void *) new_argv); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? host_cwd : NULL, + guest_argv, guest_argc, elf_path, + sysroot_path); + if (elf_host_temp) + unlink(elf_host_path); + return 1; + } + } + + /* Transfer ownership of the previous guest_argv elements */ + for (int i = 0; i < guest_argc; i++) { + new_argv[i + add_count] = guest_argv[i]; + } + + free((void *) guest_argv); + guest_argv = new_argv; + guest_argc = new_argc; + + /* Update elf_path to point to the interpreter path */ + char *new_elf_path = strdup(interp); + if (!new_elf_path) { + log_error("out of memory"); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, + have_host_cwd ? 
host_cwd : NULL, guest_argv, + guest_argc, elf_path, sysroot_path); + if (elf_host_temp) + unlink(elf_host_path); + return 1; + } + free(elf_path); + elf_path = new_elf_path; + + /* Clean up any materialized temp file before resolving the next path */ + if (elf_host_temp) { + unlink(elf_host_path); + elf_host_temp = false; + } + } + + if (shebang_depth > max_shebang_depth) { + log_error("too many levels of shebang recursion (max %d) resolving %s", + max_shebang_depth, argv[arg_start]); cleanup_main_resources(&g, guest_initialized, &sysroot_mount, have_host_cwd ? host_cwd : NULL, guest_argv, guest_argc, elf_path, sysroot_path); - if (elf_host_temp) - unlink(elf_host_path); return 1; } diff --git a/src/syscall/exec.c b/src/syscall/exec.c index ef57f7f..b25653f 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -328,86 +328,54 @@ int64_t sys_execve(hv_vcpu_t vcpu, * binfmt_script. */ elf_info_t elf_info; - if (elf_load(path_host, &elf_info) < 0) { - /* Not a valid ELF. Check if it's a script with a shebang line. Read the - * first 256 bytes and look for "#!" at the start. - */ - int script_fd = open(path_host, O_RDONLY); - if (script_fd < 0) { - err = -LINUX_ENOENT; + int shebang_depth = 0; + const int max_shebang_depth = 5; + + while (elf_load(path_host, &elf_info) < 0) { + if (shebang_depth >= max_shebang_depth) { + err = -LINUX_ELOOP; goto fail; } - char shebang_buf[256]; - ssize_t nread = read(script_fd, shebang_buf, sizeof(shebang_buf) - 1); - close(script_fd); - if (nread < 2 || shebang_buf[0] != '#' || shebang_buf[1] != '!') { - err = -LINUX_ENOEXEC; + char interp_start[256]; + char interp_arg[256]; + int rc = elf_read_shebang(path_host, interp_start, sizeof(interp_start), + interp_arg, sizeof(interp_arg)); + if (rc < 0) { + errno = -rc; + err = linux_errno(); goto fail; } - shebang_buf[nread] = '\0'; - - /* Ignore script bytes after the first line; only the shebang line - * contributes interpreter arguments. 
- */ - char *eol = strchr(shebang_buf + 2, '\n'); - if (eol) - *eol = '\0'; - - /* Parse interpreter path and optional argument. Format: "#! - * /path/to/interpreter [optional-arg]" - */ - char *interp_start = shebang_buf + 2; - while (*interp_start == ' ' || *interp_start == '\t') - interp_start++; - if (*interp_start == '\0') { + if (rc == 0) { err = -LINUX_ENOEXEC; goto fail; } - /* Linux preserves one optional shebang argument as a single argv - * element, without shell-style splitting. - */ - char *interp_arg = NULL; - char *space = interp_start; - while (*space && *space != ' ' && *space != '\t') - space++; - if (*space) { - *space = '\0'; - interp_arg = space + 1; - while (*interp_arg == ' ' || *interp_arg == '\t') - interp_arg++; - if (*interp_arg == '\0') - interp_arg = NULL; - /* Trim the line ending from the optional argument. */ - if (interp_arg) { - char *end = interp_arg + strlen(interp_arg) - 1; - while (end > interp_arg && - (*end == ' ' || *end == '\t' || *end == '\r')) - *end-- = '\0'; - } - } + shebang_depth++; - log_debug("execve: shebang interp=\"%s\" arg=\"%s\" script=\"%s\"", - interp_start, interp_arg ? interp_arg : "(none)", path); + bool has_arg = (interp_arg[0] != '\0'); + + log_debug( + "execve: shebang interp=\"%s\" arg=\"%s\" script=\"%s\" depth=%d", + interp_start, has_arg ? interp_arg : "(none)", path, shebang_depth); /* Rebuild argv: [interpreter, optional-arg, script-path, * original-argv[1:]] */ - int new_argc = 1 + (interp_arg ? 1 : 0) + 1 + (argc > 1 ? argc - 1 : 0); - if (new_argc > MAX_ARGS) { + int prefix = (has_arg ? 2 : 1) + 1; + if (argc > MAX_ARGS - prefix + 1) { err = -LINUX_E2BIG; goto fail; } + int new_argc = prefix + (argc > 1 ? argc - 1 : 0); /* Use a fixed-size stack array (MAX_ARGS+3 covers interpreter + - * optional arg + script + original argv[1:]). The alloca was - * unnecessary since the bound is compile-time known. + * optional arg + script + original argv[1:]). 
*/ char *new_argv[MAX_ARGS + 3]; int ni = 0; new_argv[ni++] = interp_start; - if (interp_arg) + if (has_arg) new_argv[ni++] = interp_arg; new_argv[ni++] = path; for (int i = 1; i < argc; i++) @@ -451,6 +419,10 @@ int64_t sys_execve(hv_vcpu_t vcpu, unlink(path_host_buf); path_host_temp = false; } + if (interp_host_temp) { + unlink(interp_host_buf); + interp_host_temp = false; + } if (interp_tx.fuse_path) { err = fuse_materialize_path(interp_tx.intercept_path, interp_host_buf, @@ -464,11 +436,6 @@ int64_t sys_execve(hv_vcpu_t vcpu, sizeof(path_host_buf)); path_host = path_host_buf; } - - if (elf_load(path_host, &elf_info) < 0) { - err = -LINUX_ENOENT; - goto fail; - } } /* Pre-PNR validation. All checks that can fail gracefully MUST happen diff --git a/tests/test-shebang-host.c b/tests/test-shebang-host.c new file mode 100644 index 0000000..dca5692 --- /dev/null +++ b/tests/test-shebang-host.c @@ -0,0 +1,151 @@ +/* Native-host unit test for shebang parsing. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "core/elf.h" + +#include "debug/log.h" + +/* Dummy log implementation to avoid linking debug/log.o */ +void log_impl(int level, const char *file, int line, const char *fmt, ...) 
+{ + (void) level; + (void) file; + (void) line; + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + fprintf(stderr, "\n"); +} + +static char *write_temp_file(const char *content, size_t len) +{ + char template[] = "/tmp/elfuse-shebang-test-XXXXXX"; + int fd = mkstemp(template); + if (fd < 0) { + perror("mkstemp"); + exit(1); + } + if (write(fd, content, len) != (ssize_t) len) { + perror("write"); + close(fd); + exit(1); + } + close(fd); + return strdup(template); +} + +static void test_case(const char *content, + size_t content_len, + int expected_rc, + const char *expected_interp, + const char *expected_arg) +{ + char *path = write_temp_file(content, content_len); + char interp[256] = {0}; + char arg[256] = {0}; + + int rc = elf_read_shebang(path, interp, sizeof(interp), arg, sizeof(arg)); + unlink(path); + free(path); + + if (rc != expected_rc) { + fprintf(stderr, "FAIL: expected rc %d, got %d for content: '", + expected_rc, rc); + for (size_t i = 0; i < content_len && i < 20; i++) { + if (content[i] == '\r') + fprintf(stderr, "\\r"); + else if (content[i] == '\n') + fprintf(stderr, "\\n"); + else + fputc(content[i], stderr); + } + fprintf(stderr, "'\n"); + exit(1); + } + + if (rc == 1) { + if (strcmp(interp, expected_interp) != 0) { + fprintf(stderr, "FAIL: expected interp '%s', got '%s'\n", + expected_interp, interp); + exit(1); + } + if (strcmp(arg, expected_arg) != 0) { + fprintf(stderr, "FAIL: expected arg '%s', got '%s'\n", expected_arg, + arg); + exit(1); + } + } +} + +int main(void) +{ + printf("Running shebang parsing unit tests...\n"); + + /* 1. LF line ending */ + const char case1[] = "#! /bin/sh -x\nline2\n"; + test_case(case1, sizeof(case1) - 1, 1, "/bin/sh", "-x"); + + /* 2. CRLF line ending */ + const char case2[] = "#!/usr/bin/env python\r\nline2\n"; + test_case(case2, sizeof(case2) - 1, 1, "/usr/bin/env", "python"); + + /* 3. 
CR line ending */ + const char case3[] = "#!/bin/bash -e\rline2\n"; + test_case(case3, sizeof(case3) - 1, 1, "/bin/bash", "-e"); + + /* 4. No trailing newline (EOF) */ + const char case4[] = "#!/bin/sh"; + test_case(case4, sizeof(case4) - 1, 1, "/bin/sh", ""); + + /* 5. Blank/empty interpreter */ + const char case5[] = "#! \n"; + test_case(case5, sizeof(case5) - 1, -ENOEXEC, "", ""); + + /* 6. Not a shebang script */ + const char case6[] = "echo hello\n"; + test_case(case6, sizeof(case6) - 1, 0, "", ""); + + /* 7. File too short */ + const char case7[] = "#"; + test_case(case7, sizeof(case7) - 1, 0, "", ""); + + /* 8. Over-long/unterminated shebang line (511 bytes: "#!" plus 509 'a', + * no EOL) */ + char case8[512]; + case8[0] = '#'; + case8[1] = '!'; + for (int i = 2; i < 511; i++) { + case8[i] = 'a'; + } + case8[511] = '\0'; + test_case(case8, 511, -ENOEXEC, "", ""); + + /* 9. Long shebang line that IS terminated within 511 bytes */ + char case9[512]; + case9[0] = '#'; + case9[1] = '!'; + for (int i = 2; i < 510; i++) { + case9[i] = 'a'; + } + case9[510] = '\n'; + case9[511] = '\0'; + /* Since the interpreter is 508 characters, and our interp buffer is only + * 256, it should return -ENOEXEC (buffer too small) + */ + test_case(case9, 511, -ENOEXEC, "", ""); + + printf("test-shebang-host: PASS\n"); + return 0; +}