Skip to content

Commit ff4f871

Browse files
bengl and claude
committed
ffi: add V8 fast-call path
Add a parallel dispatch path that uses V8 fast API calls instead of libffi for eligible native calls. At DynamicLibrary.getFunction time, register the dlsym'd target function pointer directly with V8 as the C function. Signatures with callbacks, unsupported argument types, or more than 8 arguments fall back to libffi. Patches deps/v8 to add CFunctionInfo::HasReceiver = kNo, which makes TurboFan and Turboshaft fast-call lowering omit the JS receiver from the C call. With no receiver to strip, no per-function JIT'd trampoline is needed — the target's plain C signature matches the CFunctionInfo exactly. The JS wrapper validates each argument per declared type (i32 via `(a|0)===a`, u32 via `(a>>>0)===a`, BigInt range checks for i64/u64), mirroring the libffi slow callback so the contract is identical across both paths and across V8 optimization tiers. The path is gated behind --experimental-ffi and can be disabled at build time with --without-ffi-fastcall. The previous shared-buffer JS fast path is removed, replaced by this fast-call path. Signed-off-by: Bryan English <bryan@bryanenglish.com> Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a962e72 commit ff4f871

30 files changed

Lines changed: 2097 additions & 1736 deletions

configure.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,6 +1059,12 @@
10591059
default=None,
10601060
help='build without FFI (Foreign Function Interface) support')
10611061

1062+
parser.add_argument('--without-ffi-fastcall',
1063+
action='store_true',
1064+
dest='without_ffi_fastcall',
1065+
default=False,
1066+
help='disable the FFI V8-fast-call path; libffi-only')
1067+
10621068
parser.add_argument('--experimental-quic',
10631069
action='store_true',
10641070
dest='experimental_quic',
@@ -2324,6 +2330,19 @@ def bundled_ffi_supported(os_name, target_arch):
23242330

23252331
return target_arch in supported.get(os_name, set())
23262332

2333+
def fastcall_supported(os_name, target_arch):
2334+
supported = {
2335+
'freebsd': {'arm', 'arm64', 'x64'},
2336+
'linux': {'arm', 'arm64', 'x64'},
2337+
'mac': {'arm64', 'x64'},
2338+
'win': {'arm64', 'x64'},
2339+
}
2340+
2341+
if target_arch == 'x86':
2342+
target_arch = 'ia32'
2343+
2344+
return target_arch in supported.get(os_name, set())
2345+
23272346
def configure_ffi(o):
23282347
use_ffi = not options.without_ffi
23292348

@@ -2337,6 +2356,7 @@ def configure_ffi(o):
23372356
use_ffi = False
23382357

23392358
o['variables']['node_use_ffi'] = b(use_ffi)
2359+
o['variables']['node_use_ffi_fastcall'] = b(False)
23402360

23412361
if options.without_ffi:
23422362
if options.shared_ffi:
@@ -2348,6 +2368,11 @@ def configure_ffi(o):
23482368

23492369
configure_library('ffi', o, pkgname='libffi')
23502370

2371+
use_fastcall = use_ffi and not options.without_ffi_fastcall
2372+
if use_fastcall and not fastcall_supported(flavor, o['variables']['target_arch']):
2373+
use_fastcall = False
2374+
o['variables']['node_use_ffi_fastcall'] = b(use_fastcall)
2375+
23512376
def configure_quic(o):
23522377
o['variables']['node_use_quic'] = b(options.experimental_quic and
23532378
not options.without_ssl)

deps/v8/include/v8-fast-api-calls.h

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,14 +308,28 @@ class V8_EXPORT CFunctionInfo {
308308
kBigInt = 1, // Use BigInts to represent 64 bit integers.
309309
};
310310

311+
// Whether the C function takes a JS receiver as its first argument.
312+
// Most fast-call C functions do (matching how V8 wires up FunctionTemplate
313+
// callbacks). Embedders that want to register a plain C function pointer
314+
// — e.g. an FFI dispatcher that has no use for the receiver — can set this
315+
// to kNo. In that mode V8 omits the receiver from the C call: arg_info[0]
316+
// is the first user argument, ArgumentCount() returns the user-arg count,
317+
// and the JS receiver value is discarded by the lowering instead of being
318+
// passed in the first parameter register.
319+
enum class HasReceiver : uint8_t {
320+
kYes = 0,
321+
kNo = 1,
322+
};
323+
311324
// Construct a struct to hold a CFunction's type information.
312325
// |return_info| describes the function's return type.
313326
// |arg_info| is an array of |arg_count| CTypeInfos describing the
314327
// arguments. Only the last argument may be of the special type
315328
// CTypeInfo::kCallbackOptionsType.
316329
CFunctionInfo(const CTypeInfo& return_info, unsigned int arg_count,
317330
const CTypeInfo* arg_info,
318-
Int64Representation repr = Int64Representation::kNumber);
331+
Int64Representation repr = Int64Representation::kNumber,
332+
HasReceiver has_receiver = HasReceiver::kYes);
319333

320334
const CTypeInfo& ReturnInfo() const { return return_info_; }
321335

@@ -327,6 +341,8 @@ class V8_EXPORT CFunctionInfo {
327341

328342
Int64Representation GetInt64Representation() const { return repr_; }
329343

344+
bool HasReceiverArg() const { return has_receiver_ == HasReceiver::kYes; }
345+
330346
// |index| must be less than ArgumentCount().
331347
// Note: if the last argument passed on construction of CFunctionInfo
332348
// has type CTypeInfo::kCallbackOptionsType, it is not included in
@@ -342,6 +358,7 @@ class V8_EXPORT CFunctionInfo {
342358
private:
343359
const CTypeInfo return_info_;
344360
const Int64Representation repr_;
361+
const HasReceiver has_receiver_;
345362
const unsigned int arg_count_;
346363
const CTypeInfo* arg_info_;
347364
};

deps/v8/src/api/api.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11992,9 +11992,11 @@ CFunction::CFunction(const void* address, const CFunctionInfo* type_info)
1199211992

1199311993
CFunctionInfo::CFunctionInfo(const CTypeInfo& return_info,
1199411994
unsigned int arg_count, const CTypeInfo* arg_info,
11995-
Int64Representation repr)
11995+
Int64Representation repr,
11996+
HasReceiver has_receiver)
1199611997
: return_info_(return_info),
1199711998
repr_(repr),
11999+
has_receiver_(has_receiver),
1199812000
arg_count_(arg_count),
1199912001
arg_info_(arg_info) {
1200012002
DCHECK(repr == Int64Representation::kNumber ||

deps/v8/src/compiler/fast-api-calls.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -385,10 +385,14 @@ FastApiCallFunction GetFastApiCallTarget(
385385
function_template_info.c_signatures(broker);
386386
const size_t overloads_count = signatures.size();
387387

388-
// Only considers entries whose type list length matches arg_count.
388+
// Only considers entries whose type list length matches arg_count. For
389+
// signatures registered with HasReceiver=kNo, the C-side ArgumentCount
390+
// already excludes the receiver, so we don't subtract it here.
389391
for (size_t i = 0; i < overloads_count; i++) {
390392
const CFunctionInfo* c_signature = signatures[i];
391-
const size_t len = c_signature->ArgumentCount() - kReceiver;
393+
const size_t len =
394+
c_signature->ArgumentCount() -
395+
(c_signature->HasReceiverArg() ? kReceiver : 0);
392396
bool optimize_to_fast_call =
393397
(len == arg_count) &&
394398
fast_api_call::CanOptimizeFastSignature(c_signature);

deps/v8/src/compiler/js-call-reducer.cc

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,9 @@ class FastApiCallReducerAssembler : public JSCallReducerAssembler {
662662
// arguments, so extract c_argument_count from the first function.
663663
const int c_argument_count =
664664
static_cast<int>(c_function_.signature->ArgumentCount());
665-
CHECK_GE(c_argument_count, kReceiver);
665+
if (c_function_.signature->HasReceiverArg()) {
666+
CHECK_GE(c_argument_count, kReceiver);
667+
}
666668

667669
const int slow_arg_count =
668670
// Arguments for CallApiCallbackOptimizedXXX builtin including
@@ -677,11 +679,16 @@ class FastApiCallReducerAssembler : public JSCallReducerAssembler {
677679
base::SmallVector<Node*, kInlineSize> inputs(value_input_count +
678680
kEffectAndControl);
679681
int cursor = 0;
680-
inputs[cursor++] = n.receiver();
682+
const bool has_receiver_arg =
683+
c_function_.signature->HasReceiverArg();
684+
if (has_receiver_arg) {
685+
inputs[cursor++] = n.receiver();
686+
}
681687

682688
// TODO(turbofan): Consider refactoring CFunctionInfo to distinguish
683689
// between receiver and arguments, simplifying this (and related) spots.
684-
int js_args_count = c_argument_count - kReceiver;
690+
int js_args_count =
691+
c_argument_count - (has_receiver_arg ? kReceiver : 0);
685692
for (int i = 0; i < js_args_count; ++i) {
686693
if (i < n.ArgumentCount()) {
687694
inputs[cursor++] = n.Argument(i);
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
# FFI Fast-Call Internals
2+
3+
This document is for contributors who maintain or extend the FFI
4+
fast-call path (the V8 Fast API Calls implementation in `node:ffi`).
5+
For end-user behavior, see [doc/api/ffi.md](../api/ffi.md).
6+
7+
## Overview
8+
9+
For each registered FFI function whose signature is fast-call eligible
10+
(`src/ffi/types.cc:IsFastCallEligible`), Node generates a tiny native
11+
trampoline that strips the `Local<Object>` receiver V8 fast calls
12+
require and tail-calls the user's target function. The trampoline
13+
address is handed to `v8::CFunction`. A JS wrapper
14+
(`lib/internal/ffi-fastcall.js`) validates args, routes object-typed
15+
pointer args to a libffi slow path, and checks a per-library "alive"
16+
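sentinel before each call.

> **Note (needs reconciliation):** the commit description for this change
> says that registering signatures with `CFunctionInfo::HasReceiver = kNo`
> lets V8 omit the receiver from the C call, so no per-function JIT'd
> trampoline is needed. The trampoline, stub-emitter, and JIT-memory
> sections of this document describe the receiver-stripping design and
> appear to predate that change; confirm which design is current before
> relying on them.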
17+
18+
The libffi path remains for callbacks (`ffi_prep_closure_loc`),
19+
ineligible signatures (signatures containing the FFI `function` type),
20+
and unsupported platforms.
21+
22+
## Eligibility (`src/ffi/types.cc:IsFastCallEligible`)
23+
24+
A signature is fast-call eligible iff all of:
25+
26+
1. The platform is supported (see Platform Support below).
27+
2. Return type is one of: void, i8/u8/i16/u16, i32/u32, i64/u64,
28+
f32/f64, pointer.
29+
3. Every arg type is in that set.
30+
4. No arg or return is the FFI `function` type.
31+
5. Per-ABI argument caps:
32+
- AArch64: ≤ 7 GP, ≤ 8 FP
33+
- x86_64 SysV: ≤ 6 GP, ≤ 8 FP
34+
- x86_64 Win64: GP + FP combined ≤ 3 (positional register slots — 4 minus the receiver)
35+
- AArch32 hardfp: ≤ 3 GP, ≤ 8 FP; i64/u64 args and return type rejected
36+
37+
`IsFastCallEligible(fn, &reason)` returns false with a static reason
38+
string on miss.
39+
40+
## Platform support
41+
42+
| ABI | Emitter file | Status |
43+
|---|---|---|
44+
| AArch64 (Linux/macOS/FreeBSD/Windows) | `stub_emitter_aarch64.cc` | Implemented, runtime-verified |
45+
| x86_64 SysV (Linux/macOS/FreeBSD) | `stub_emitter_x64_sysv.cc` | Implemented, CI-verified |
46+
| x86_64 Win64 | `stub_emitter_x64_win.cc` | Implemented, CI-verified |
47+
| AArch32 hardfp (Linux/FreeBSD) | `stub_emitter_arm.cc` | Implemented, CI-verified |
48+
49+
On platforms without an emitter, all registrations fall back to libffi.
50+
51+
Adding a new ABI: implement `EmitForwarder` for the new platform in a
52+
new `stub_emitter_<abi>.cc`, gate it via `node.gyp` conditions on
53+
`target_arch` and `OS`, and add the `(os, arch)` pair to
54+
`fastcall_supported` in `configure.py`.
55+
56+
## Stub generation (`src/ffi/fastcall/stub_emitter_*.cc`)
57+
58+
Each stub does, at most, three things:
59+
60+
1. Shift GP regs down by one slot (drop the receiver).
61+
2. (Win64 only) shift FP regs down by one slot — Win64's FP/GP register
62+
slots are positional, so stripping a GP arg also reindexes FP slots.
63+
3. Tail-call the target via an indirect jump.
64+
65+
For SysV signatures with 6 GP args (the cap; 7 GP values counting the receiver), the stub uses a call+ret pattern with stack
66+
rewrite (because the 7th GP slot lives on the stack). Other ABIs cap
67+
below their stack overflow point in v1 to keep emitters simple.
68+
69+
## JIT memory (`src/ffi/fastcall/jit_memory.cc`)
70+
71+
A process-global singleton on top of platform `mmap`/`VirtualAlloc`.
72+
Allocates 64-byte slot-aligned chunks within page-aligned allocations.
73+
After writing the stub, the page is transitioned to RX via `mprotect` /
74+
`VirtualProtect`; once a page goes RX, no further allocation happens
75+
in it (the bump cursor is locked).
76+
77+
The original spec called for `v8::PageAllocator`, but neither
78+
`Isolate::GetArrayBufferAllocator()->GetPageAllocator()` nor
79+
`Platform::GetPageAllocator()` returns a usable allocator in Node's
80+
embedded configuration — both default to `nullptr`. The implementation
81+
uses direct system calls (with `MAP_JIT` on Apple Silicon) instead.
82+
83+
`Free` decrements the live-byte counter but does not return memory.
84+
Pages stay alive for the process lifetime.
85+
86+
Concurrent emit from multiple isolates is safe via
87+
`JitMemory::EmitStub(code, size)`, which holds the singleton mutex across
88+
allocate + memcpy + RX-transition. The lower-level `Allocate` /
89+
`MakeExecutable` / `Free` methods remain public for the self-test only
90+
(which writes platform-specific instruction bytes after Allocate but
91+
before MakeExecutable, and needs that explicit step ordering).
92+
93+
## Self-test
94+
95+
`JitMemory::SelfTest` allocates a tiny stub, writes a `ret`-style
96+
native sequence, transitions to RX, and calls it. Cached in a
97+
process-wide atomic via `std::call_once`. Run once per process at
98+
first FFI registration. On failure, every subsequent registration
99+
falls back to libffi-only and a process warning is emitted via
100+
`ProcessEmitWarning`.
101+
102+
This catches:
103+
- macOS `MAP_JIT` entitlement missing (e.g., signed binary without
104+
`com.apple.security.cs.allow-jit`).
105+
- Hardened-runtime restrictions.
106+
- SELinux execmem denial.
107+
108+
## JS wrapper (`lib/internal/ffi-fastcall.js`)
109+
110+
For each fast-call-eligible inner v8::Function returned from C++,
111+
`buildWrapper` creates a JS wrapper that:
112+
113+
1. Reads the per-library "alive" `Uint8Array` and throws
114+
`ERR_FFI_LIBRARY_CLOSED` if `alive[0] !== 0` (a non-zero value marks the library as closed).
115+
2. Validates each argument, mirroring `ToFFIArgument` in
116+
`src/ffi/types.cc`. Same `ERR_INVALID_ARG_VALUE`
117+
codes, same messages, same range bounds.
118+
3. Pointer args:
119+
- BigInt or null/undefined: pass through as primitive.
120+
- String / Buffer / ArrayBuffer / ArrayBufferView: `ReflectApply`
121+
the `kFastcallInvokeSlow` libffi-backed v8::Function with the
122+
original args.
123+
4. Calls the inner v8::Function with positional primitives. V8's fast
124+
call engages when TurboFan inlines the wrapper.
125+
126+
The wrapper body is **arity-specialized**: arities 0..6 are unrolled into
127+
distinct closures with named parameters (`function(a0, a1, ...)`), so V8
128+
inlines them and the per-arg type info / pointer flag are read from
129+
closure locals instead of arrays. Arities 7+ use a rest-args fallback. This
130+
matters: an earlier draft used a single generic `function(...args)` plus
131+
`ReflectApply`, which dropped FFI throughput by 30–50% vs. the previous libffi + shared-buffer (libffi+SB)
132+
path. The arity specialization gets the throughput back to 5–13× the
133+
libffi+SB baseline (see commit `81d908e48da` for the fix and benchmarks).
134+
135+
The wrapper is patched onto `DynamicLibrary.prototype.getFunction`,
136+
`getFunctions`, and the `functions` accessor.
137+
138+
## Internal symbols
139+
140+
The JS wrapper looks for these per-isolate Symbols on the inner
141+
`v8::Function`. They are defined in `src/env_properties.h` and
142+
attached by `DynamicLibrary::CreateFunction` for fast-call-eligible
143+
signatures only:
144+
145+
| Symbol | Value | Purpose |
146+
|---|---|---|
147+
| `kFastcallAlive` | `Uint8Array(1)` shared with `DynamicLibrary` | close sentinel |
148+
| `kFastcallInvokeSlow` | `v8::Function` over `InvokeFunction` | object-arg fallback |
149+
| `kFastcallParams` | `string[]` of parameter type names | wrapper introspection |
150+
| `kFastcallResult` | result type name string | wrapper introspection |
151+
152+
## Lifecycle
153+
154+
**Registration:** `CreateFunction` in `src/node_ffi.cc` builds a
155+
`fastcall::CFunctionInfoBundle` (which owns the heap-allocated
156+
`v8::CFunctionInfo` + `v8::CTypeInfo[]`), allocates and emits the stub via
157+
`JitMemory::EmitStub`, then constructs the inner `v8::Function` via a
158+
`FunctionTemplate` with the `CFunction` attached. Per-function fast-call
159+
state is stored on `FFIFunctionInfo::fast` (a `unique_ptr<FastCallState>`,
160+
null when fast-call is unavailable for that signature).
161+
162+
**Per-call:** wrapper validates → calls inner. V8 picks fast or slow
163+
callback. Slow = `InvokeFunction` (libffi); fast = our generated stub →
164+
target.
165+
166+
**`lib.close()`:** flips the alive sentinel (`alive[0] = 1`). The wrapper
167+
throws `ERR_FFI_LIBRARY_CLOSED` on subsequent calls. Slow-path
168+
`InvokeFunction` independently checks `fn->closed` for the same effect on
169+
ineligible signatures. Stubs are NOT freed at close.
170+
171+
**Weak callback (function GC'd):** `CleanupFunctionInfo` resets
172+
`info->fast`, whose `~FastCallState` destructor calls `JitMemory::Free`
173+
on the stub.
174+
175+
## Testing
176+
177+
- `test/cctest/test_ffi_fastcall_*.cc`: unit tests for emitters, JIT
178+
memory, eligibility, CFunctionInfo builder.
179+
- `test/ffi/test-ffi-*.js`: JS-level integration tests covering
180+
types, arity, callbacks, permissions, etc. (existing FFI suite —
181+
reused as the integration baseline).
182+
183+
When debugging unexpected fast-call behavior, log the eligibility miss
184+
reason via the second arg to `IsFastCallEligible`. Set the
185+
`--without-ffi-fastcall` configure flag to A/B test against the
186+
libffi-only path.

lib/ffi.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ const {
5454
toArrayBuffer,
5555
} = internalBinding('ffi');
5656

57-
require('internal/ffi-shared-buffer');
57+
require('internal/ffi-fastcall');
5858

5959
DynamicLibrary.prototype[SymbolDispose] = function() {
6060
this.close();

lib/internal/errors.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1231,6 +1231,7 @@ E('ERR_FEATURE_UNAVAILABLE_ON_PLATFORM',
12311231
'The feature %s is unavailable on the current platform' +
12321232
', which is being used to run Node.js',
12331233
TypeError);
1234+
E('ERR_FFI_LIBRARY_CLOSED', 'Library is closed', Error);
12341235
E('ERR_FS_CP_DIR_TO_NON_DIR',
12351236
'Cannot overwrite non-directory with directory', SystemError);
12361237
E('ERR_FS_CP_EEXIST', 'Target already exists', SystemError);

0 commit comments

Comments
 (0)