Line data Source code
1 : #ifndef HEADER_fd_src_flamenco_vm_fd_vm_h
2 : #define HEADER_fd_src_flamenco_vm_fd_vm_h
3 :
4 : #include "fd_vm_base.h"
5 : #include "../../ballet/sha256/fd_sha256.h"
6 :
/* fd_vm_t is an opaque handle to a virtual machine that can execute
   sBPF programs.  The typedef below also introduces the struct fd_vm
   tag as an incomplete type at file scope. */

typedef struct fd_vm fd_vm_t;
12 :
13 : /**********************************************************************/
14 : /* FIXME: MOVE TO FD_VM_PRIVATE WHEN CONSTRUCTORS READY */
15 :
16 : /* A fd_vm_shadow_t holds stack frame information not accessible from
17 : within a program. */
18 :
19 : struct fd_vm_shadow { ulong r6; ulong r7; ulong r8; ulong r9; ulong r10; ulong pc; };
20 : typedef struct fd_vm_shadow fd_vm_shadow_t;
21 :
22 : /* fd_vm_input_region_t holds information about fragmented memory regions
23 : within the larger input region. */
24 :
25 : struct __attribute__((aligned(8UL))) fd_vm_input_region {
26 : ulong vaddr_offset; /* Represents offset from the start of the input region. */
27 : ulong haddr; /* Host address corresponding to the start of the mem region. */
28 : uint region_sz; /* Size of the memory region. */
29 : ulong address_space_reserved; /* The amount of address space reserved for the region. */
30 : uchar is_writable; /* If the region can be written to or is read-only */
31 : ulong acc_region_meta_idx; /* Index of the acc_region_meta_t struct for the account corresponding to this region. */
32 : };
33 : typedef struct fd_vm_input_region fd_vm_input_region_t;
34 :
35 : /* fd_vm_acc_region_meta_t holds metadata about a given account. An array of these
36 : structs will map an instruction account index to its respective input memory
37 : region location. */
38 :
39 : struct __attribute((aligned(8UL))) fd_vm_acc_region_meta {
40 : uint region_idx;
41 : /* FIXME: We can get rid of this field once DM is activated. This is
42 : only a hack to make the non-DM code path happy. When DM is
43 : activated, we could query the input_mem_region array for the
44 : original data len. */
45 : ulong original_data_len;
46 : /* The transaction account corresponding to this account. */
47 : fd_account_meta_t * meta;
48 :
49 : /* The expected virtual addresses of the serialized pubkey, lamports, owner,
50 : and data for this account in VM address space.
51 : Used for CPI security checks. */
52 : ulong vm_addr;
53 : ulong vm_key_addr;
54 : ulong vm_lamports_addr;
55 : ulong vm_owner_addr;
56 : ulong vm_data_addr;
57 : };
58 : typedef struct fd_vm_acc_region_meta fd_vm_acc_region_meta_t;
59 :
60 : /* In Agave, all the regions are 16-byte aligned in host address space. There is then an alignment check
61 : which is done inside each syscall memory translation, checking if the data is aligned in host address
62 : space. This is a layering violation, as it leaks the host address layout into the consensus model.
63 :
64 : In the future we will change this alignment check in the vm to purely operate on the virtual address space,
65 : taking advantage of the fact that Agave regions are known to be aligned. For now, we align our regions to
66 : either 8 or 16 bytes, as there are no 16-byte alignment translations in the syscalls currently:
67 : stack: 16 byte aligned
68 : heap: 16 byte aligned
69 : input: 8 byte aligned
70 : rodata: 8 byte aligned
71 :
72 : https://github.com/solana-labs/rbpf/blob/cd19a25c17ec474e6fa01a3cc3efa325f44cd111/src/ebpf.rs#L39-L40 */
73 11181 : #define FD_VM_HOST_REGION_ALIGN (16UL)
74 :
75 : struct __attribute__((aligned(FD_VM_HOST_REGION_ALIGN))) fd_vm {
76 :
77 : /* VM configuration */
78 :
79 : /* FIXME: suspect these three should be replaced by some kind of VM
80 : enabled feature struct (though syscalls do seem to make additional
81 : non-trivial use of instr_ctx). */
82 :
83 : fd_exec_instr_ctx_t * instr_ctx; /* FIXME: DOCUMENT */
84 :
85 : /* FIXME: frame_max should be run time configurable by compute budget.
86 : If there is no reasonable upper bound on this, shadow and stack
87 : will need to be provided by users. */
88 :
89 : //ulong frame_max; /* Maximum number of stack frames, in [0,FD_VM_STACK_FRAME_MAX] */
90 : ulong heap_max; /* Maximum amount of heap in bytes, in [0,FD_VM_HEAP_MAX] */
91 : ulong entry_cu; /* Initial number of compute units for this program, in [0,FD_VM_COMPUTE_UNIT_LIMIT] */
92 :
93 : /* FIXME: The below are practically an exact match to the
94 : fields of an fd_sbpf_program_t (sans ELF info) */
95 :
96 : uchar const * rodata; /* Program read only data, indexed [0,rodata_sz), aligned 8 */
97 : ulong rodata_sz; /* Program read only data size in bytes, FIXME: BOUNDS? */
98 : ulong const * text; /* Program sBPF words, indexed [0,text_cnt), aligned 8 */
99 : ulong text_cnt; /* Program sBPF word count, all text words are inside the rodata */
100 : ulong text_off; /* CALLX virtual address offset in bytes (NOT words).
101 : SBPF V0-V2: ==(ulong)text - (ulong)rodata (file offset of text within ELF).
102 : SBPF V3: ==0 (bytecode starts at vaddr 0x100000000 exactly) */
103 : ulong text_sz; /* Program sBPF size in bytes, == text_cnt*8 */
104 :
105 : ulong entry_pc; /* Initial program counter, in [0,text_cnt)
106 : FIXME: MAKE SURE NOT INTO MW INSTRUCTION, MAKE SURE VALID CALLDEST? */
107 : ulong const * calldests; /* Bit vector of local functions that can be called into, bit indexed in [0,text_cnt) */
108 : /* FIXME: ADD BIT VECTOR OF FORBIDDEN BRANCH TARGETS (E.G.
109 : INTO THE MIDDLE OF A MULTIWORD INSTRUCTION) */
110 :
111 : fd_sbpf_syscalls_t const * syscalls; /* The map of syscalls (sharable over multiple concurrently running vm) */
112 :
113 : fd_vm_trace_t * trace; /* Location to stream traces (no tracing if NULL) */
114 :
115 : /* VM execution and syscall state */
116 :
117 : /* These are used to communicate the execution and syscall state to
118 : users and syscalls. These are initialized based on the above when
119 : a program starts executing. When program halts or faults, these
120 : provide precise execution diagnostics to the user (and potential
121 : breakpoint/continue functionality in the future). When the vm
122 : makes a syscall, the vm will set these precisely and, when a
123 : syscall returns, the vm will update its internal execution state
124 : appropriately. */
125 :
126 : /* IMPORTANT SAFETY TIP! THE BEHAVIOR OF THE SYSCALL ALLOCATOR FOR
127 : HEAP_SZ MUST EXACTLY MATCH THE SOLANA VALIDATOR ALLOCATOR:
128 :
129 : https://github.com/solana-labs/solana/blob/v1.17.23/program-runtime/src/invoke_context.rs#L122-L148
130 :
131 : BIT-FOR-BIT AND BUG-FOR-BUG. SEE THE SYSCALL_ALLOC_FREE FOR MORE
132 : DETAILS. */
133 :
134 : ulong pc; /* The current instruction, in [0,text_cnt) in normal execution, may be out of bounds in a fault */
135 : ulong ic; /* The number of instructions which have been executed */
136 : ulong cu; /* The remaining CUs left for the transaction, positive in normal execution, may be zero in a fault */
137 : ulong frame_cnt; /* The current number of stack frames pushed, in [0,frame_max] */
138 :
139 : ulong heap_sz; /* Heap size in bytes, in [0,heap_max] */
140 :
141 : /* VM memory */
142 :
143 : /* The vm classifies the 64-bit vm address space into 6 regions:
144 :
145 : 0 - unmapped lo
146 : 1 - program -> [FD_VM_MEM_MAP_PROGRAM_REGION_START,FD_VM_MEM_MAP_PROGRAM_REGION_START+4GiB)
147 : 2 - stack -> [FD_VM_MEM_MAP_STACK_REGION_START, FD_VM_MEM_MAP_STACK_REGION_START +4GiB)
148 : 3 - heap -> [FD_VM_MEM_MAP_HEAP_REGION_START, FD_VM_MEM_MAP_HEAP_REGION_START +4GiB)
149 : 4 - input -> [FD_VM_MEM_MAP_INPUT_REGION_START, FD_VM_MEM_MAP_INPUT_REGION_START +4GiB)
150 : 5 - unmapped hi
151 :
152 : These mappings are encoded in a software TLB consisting of three
153 : 6-element arrays: region_haddr, region_ld_sz and region_st_sz.
154 :
155 : region_haddr[i] gives the location in host address space of the
156 : first byte in region i. region_{ld,st}_sz[i] gives the number of
157 : mappable bytes in this region for {loads,stores}. Note that
158 : region_{ld,st}_sz[i]<2^32. Further note that
159 : [region_haddr[i],region_haddr[i]+region_{ld,st}_sz[i]) does not
160 : wrap around in host address space and does not overlap with any
161 : other usages.
162 :
163 : region_{ld,st}_sz[0] and region_{ld,st}_sz[5] are zero such that
164 : requests to access data from a positive sz range in these regions
165 : will fail, making regions 0 and 5 unreadable and unwritable. As
166 : such, region_haddr[0] and region_haddr[5] are arbitrary; NULL is
167 : used as the obvious default.
168 :
169 : region_st_sz[1] is also zero such that requests to store data to
170 : any positive sz range in this region will fail, making region 1
171 : unwritable.
172 :
173 : When the direct mapping feature is enabled, the input region will
174 : no longer be a contigious buffer of host memory. Instead
175 : it will compose of several fragmented regions of memory each with
176 : its own read/write privleges and size. Address translation to the
177 : input region will now have to rely on a binary search lookup of the
178 : start of the appropriate area of physical memory. It also involves
179 : doing a check against if the region can be written to. */
180 :
181 : /* FIXME: If accessing memory beyond the end of the current heap
182 : region is not allowed, sol_alloc_free will need to update the tlb
183 : arrays during program execution (this is trivial). At the same
184 : time, given sol_alloc_free is deprecated, this is unlikely to be
185 : the case. */
186 :
187 : ulong region_haddr[6];
188 : uint region_ld_sz[6];
189 : uint region_st_sz[6];
190 :
191 : /* fd_vm_input_region_t and fd_vm_acc_to_mem arrays are passed in by the bpf
192 : loaders into fd_vm_init.
193 : TODO: It might make more sense to allocate space for these in the VM. */
194 : fd_vm_input_region_t * input_mem_regions; /* An array of input mem regions represent the input region.
195 : The virtual addresses of each region are contigiuous and
196 : strictly increasing. */
197 : uint input_mem_regions_cnt;
198 : fd_vm_acc_region_meta_t * acc_region_metas; /* Represents a mapping from the instruction account indicies
199 : from the instruction context to the input memory region index
200 : of the account's data region in the input space. */
201 : uchar is_deprecated; /* The vm requires additional checks in certain CPIs if the
202 : vm's current instance was initialized by a deprecated program. */
203 :
204 : ulong reg [ FD_VM_REG_MAX ]; /* registers, indexed [0,FD_VM_REG_CNT). Note that FD_VM_REG_MAX>FD_VM_REG_CNT.
205 : As such, malformed instructions, which can have src/dst reg index in
206 : [0,FD_VM_REG_MAX), cannot access info outside reg. Aligned 8. */
207 : fd_vm_shadow_t shadow[ FD_VM_STACK_FRAME_MAX ]; /* shadow stack, indexed [0,frame_cnt), if frame_cnt>0, 0/frame_cnt-1 is
208 : bottom/top. Aligned 16. */
209 : uchar stack [ FD_VM_STACK_MAX ]; /* stack, indexed [0,FD_VM_STACK_MAX). Divided into FD_VM_STACK_FRAME_MAX
210 : frames. Each frame has a FD_VM_STACK_GUARD_SZ region followed by a
211 : FD_VM_STACK_FRAME_SZ region. reg[10] gives the offset of the start of the
212 : current stack frame. Aligned 16. */
213 : uchar heap [ FD_VM_HEAP_MAX ]; /* syscall heap, [0,heap_sz) used, [heap_sz,heap_max) free. Aligned 8. */
214 :
215 : fd_sha256_t * sha; /* Pre-joined SHA instance. This should be re-initialised before every use. */
216 :
217 : ulong magic; /* ==FD_VM_MAGIC */
218 :
219 : int direct_mapping; /* If direct mapping feature flag is enabled */
220 : int syscall_parameter_address_restrictions; /* If syscall_parameter_address_restrictions feature flag is enabled */
221 : int virtual_address_space_adjustments; /* If virtual_address_space_adjustments feature flag is enabled */
222 :
223 : ulong stack_frame_sz; /* The size of a stack frame gap, in bytes. 0 if this is variable */
224 : ulong stack_push_frame_count; /* The number of stack frames to adjust the stack by on every stack push */
225 :
226 : /* Agave uses the segv vaddr in several different cases, including:
227 : - Determining whether or not to return a regular or stack access violation
228 : - (If direct mapping is enabled) determining the instruction error
229 : code to return on store operations. */
230 : ulong segv_vaddr;
231 : ulong segv_access_len;
232 : uchar segv_access_type;
233 :
234 : ulong sbpf_version; /* SBPF version, SIMD-0161 */
235 :
236 : int dump_syscall_to_pb; /* If true, syscalls will be dumped to the specified output directory */
237 : };
238 :
239 : /* FIXME: MOVE ABOVE INTO PRIVATE WHEN CONSTRUCTORS READY */
240 : /**********************************************************************/
241 :
242 : FD_PROTOTYPES_BEGIN
243 :
244 : /* FIXME: FD_VM_T NEEDS PROPER CONSTRUCTORS */
245 :
246 : /* FD_VM_{ALIGN,FOOTPRINT} describe the alignment and footprint needed
247 : for a memory region to hold a fd_vm_t. ALIGN is a positive
248 : integer power of 2. FOOTPRINT is a multiple of align.
249 : These are provided to facilitate compile time declarations. */
250 11181 : #define FD_VM_ALIGN FD_VM_HOST_REGION_ALIGN
251 5583 : #define FD_VM_FOOTPRINT (527856UL)
252 :
253 : /* fd_vm_{align,footprint} give the needed alignment and footprint
254 : of a memory region suitable to hold an fd_vm_t.
255 : Declaration / aligned_alloc / fd_alloca friendly (e.g. a memory
256 : region declared as "fd_vm_t _vm[1];", or created by
257 : "aligned_alloc(alignof(fd_vm_t),sizeof(fd_vm_t))" or created
258 : by "fd_alloca(alignof(fd_vm_t),sizeof(fd_vm_t))" will all
259 : automatically have the needed alignment and footprint).
260 : fd_vm_{align,footprint} return the same value as
261 : FD_VM_{ALIGN,FOOTPRINT}. */
262 : FD_FN_CONST ulong
263 : fd_vm_align( void );
264 :
265 : FD_FN_CONST ulong
266 : fd_vm_footprint( void );
267 :
/* FD_VM_MAGIC is the value held in the magic field of a fd_vm_t.  The
   UL suffix makes the constant unsigned, matching the ulong field it
   is compared against (an unsuffixed hex constant of this size would
   be a signed type). */

#define FD_VM_MAGIC (0xF17EDA2CEF0UL) /* FIREDANCE SBPF V0 */
269 :
/* fd_vm_new formats a memory region with suitable alignment and
   footprint for holding a fd_vm_t.  shmem is assumed to point, in the
   caller's address space, to the first byte of the memory region to
   use, which the caller owns.  Returns shmem on success and NULL on
   failure (logs details).  On successful return the memory region is
   owned by the vm state.  The caller is not joined on return. */

void * fd_vm_new( void * shmem );
279 :
280 : /* fd_vm_join joins the caller to a vm.
281 : Assumes shmem points to the first byte of the memory region holding
282 : the vm. Returns a local handle to the join on success (this is
283 : not necessarily a simple cast of the address) and NULL on failure
284 : (logs details). */
285 : fd_vm_t *
286 : fd_vm_join( void * shmem );
287 :
288 : /* fd_vm_init initializes the given fd_vm_t struct, checking that it is
289 : not null and has the correct magic value.
290 :
291 : It modifies the vm object and also returns the object for convenience.
292 :
293 : FIXME: we should split out the memory mapping setup from this function
294 : to handle those errors separately. */
295 : fd_vm_t *
296 : fd_vm_init(
297 : fd_vm_t * vm,
298 : fd_exec_instr_ctx_t * instr_ctx,
299 : ulong heap_max,
300 : ulong entry_cu,
301 : uchar const * rodata,
302 : ulong rodata_sz,
303 : ulong const * text,
304 : ulong text_cnt,
305 : ulong text_off,
306 : ulong text_sz,
307 : ulong entry_pc,
308 : ulong const * calldests,
309 : ulong sbpf_version,
310 : fd_sbpf_syscalls_t * syscalls,
311 : fd_vm_trace_t * trace,
312 : fd_sha256_t * sha,
313 : fd_vm_input_region_t * mem_regions,
314 : uint mem_regions_cnt,
315 : fd_vm_acc_region_meta_t * acc_region_metas,
316 : uchar is_deprecated,
317 : int direct_mapping,
318 : int syscall_parameter_address_restrictions,
319 : int virtual_address_space_adjustments,
320 : int dump_syscall_to_pb,
321 : ulong r2_initial_value );
322 :
323 : /* fd_vm_leave leaves the caller's current local join to a vm.
324 : Returns a pointer to the memory region holding the vm on success
325 : (this is not necessarily a simple cast of the
326 : address) and NULL on failure (logs details). The caller is not
327 : joined on successful return. */
328 : void *
329 : fd_vm_leave( fd_vm_t * vm );
330 :
/* fd_vm_delete unformats a memory region that holds a vm.  shmem is
   assumed to point, in the caller's address space, to the first byte
   of the memory region holding the state, and nobody is assumed to be
   joined.  Returns a pointer to the memory region on success and NULL
   on failure (logs details).  The caller has ownership of the memory
   region on successful return. */

void * fd_vm_delete( void * shmem );
339 :
340 : /* fd_vm_validate validates the sBPF program in the given vm. Returns
341 : success or an error code. Called before executing a sBPF program.
342 : FIXME: DOCUMENT BETTER */
343 :
344 : FD_FN_PURE int
345 : fd_vm_validate( fd_vm_t const * vm );
346 :
347 : /* fd_vm_is_check_align_enabled returns 1 if the vm should check alignment
348 : when doing memory translation. */
349 : FD_FN_PURE static inline int
350 48492 : fd_vm_is_check_align_enabled( fd_vm_t const * vm ) {
351 48492 : return !vm->is_deprecated;
352 48492 : }
353 :
354 : /* fd_vm_is_check_size_enabled returns 1 if the vm should check size
355 : when doing memory translation. */
356 : FD_FN_PURE static inline int
357 0 : fd_vm_is_check_size_enabled( fd_vm_t const * vm ) {
358 0 : return !vm->is_deprecated;
359 0 : }
360 :
361 : /* fd_vm_exec runs vm from program start to program halt or program
362 : fault, appending an execution trace if vm is attached to a trace.
363 :
364 : Since this is running from program start, this will init r1 and r10,
365 : pop all stack frames and free all heap allocations.
366 :
367 : IMPORTANT SAFETY TIP! This currently does not zero out any other
368 : registers, the user stack region or the user heap. (FIXME: SHOULD
369 : IT??)
370 :
371 : Returns FD_VM_SUCCESS (0) on success and an FD_VM_ERR code (negative)
372 : on failure. Reasons for failure include:
373 :
374 : INVAL - NULL vm (or, for fd_vm_exec_trace, the vm is not
375 : attached to trace). FIXME: ADD OTHER INPUT ARG CHECKS?
376 :
377 : SIGTEXT - A jump/call set the program counter outside the text
378 : region or the program counter incremented beyond the
379 : text region. pc will be at the out of bounds location.
380 : ic and cu will not include the out of bounds location.
381 : For a call, the call stack frame was allocated.
382 :
383 : SIGSPLIT - A jump/call set the program counter into the middle of
384 : a multiword instruction or a multiword instruction went
385 : past the text region end. pc will be at the split. ic
386 : and cu will not include the split. For a call, the
387 : call stack frame was allocated.
388 :
389 : SIGCALL - A call set the program counter to a non-function
390 : location. pc will be at the non-function location. ic
391 : and cu will include the call but not include the
392 : non-function location. The call stack frame was
393 : allocated.
394 :
395 : SIGSTACK - The call depth limit was exceeded. pc will be at the
396 : call. ic and cu will include the call but not the call
397 : target. The call stack frame was not allocated.
398 :
399 : SIGILL - An invalid instruction was encountered (including an
400 : invalid opcode and an endian swap with an invalid bit
401 : width). pc will be at the invalid instruction. ic and
402 : cu will not include the invalid instruction.
403 :
404 : SIGSEGV - An invalid memory access (outside the program memory
405 : map) was encountered. pc will be at the faulting
406 : instruction. ic and cu will not include the faulting
407 : instruction.
408 :
409 : SIGBUS - An unaligned memory access was encountered. pc will be
410 : at the faulting instruction. ic and cu will not
411 : include the faulting instruction. (Note: currently
412 : mapped to SIGSEGV and then only if check_align is
413 : enabled.)
414 :
415 : SIGRDONLY - A write to read-only memory address was encountered.
416 : pc will be at the faulting instruction. ic and cu will
417 : not include the faulting instruction. (Note: currently
418 : mapped to SIGSEGV.)
419 :
420 : SIGCOST - The compute limit was exceeded. pc will be at the
421 : first non-executed instruction (if pc is a syscall, the
422 : syscall might have been partially executed when it ran
423 : out of budget .. see safety tip below). ic will cover
424 : all executed instructions. cu will be zero.
425 :
426 : This considers any error returned by a syscall as a fault and
427 : returns the syscall error code here. See syscall documentation for
428 : details here. When a syscall faults, pc will be at the syscall, ic
429 : will include the syscall and cu will include the syscall and any
430 : additional costs the syscall might have incurred up to that point of
431 : the fault.
432 :
433 : IMPORTANT SAFETY TIP! Ideally, a syscall should only modify vm's
434 : state when it knows its overall syscall will be successful.
435 : Unfortunately, this is often not practical (e.g. a syscall starts
436 : processing a list of user provided commands and discovers an error
437 : condition late in the command list that did not exist at syscall
438 : start because the error condition was created by successfully
439 : executed commands earlier in the list). As such, vm's state on a
440 : faulting syscall may not be clean.
441 :
442 : FIXME: SINCE MOST SYSCALLS CAN BE IMPLEMENTED TO HAVE CLEAN FAULTING
443 : BEHAVIOR, PROVIDE A MECHANISM SO USERS CAN EASILY DETECT UNCLEAN
444 : SYSCALL FAULTS?
445 :
446 : For SIGCOST, note that the vm can speculate ahead when processing
447 : instructions. This makes it possible to have a situation where
448 : a vm faults with, for example, SIGSEGV from a speculatively
449 : executed memory access while a non-speculative execution would have
450 : faulted with SIGCOST on an earlier instruction. In these situations,
451 : pc will be at the faulting speculatively executed instruction, ic
452 : will include all the speculatively executed instructions, cu will be
453 : zero and vm's state will include the impact of all the speculation.
454 :
455 : IMPORTANT SAFETY TIP! While different vm implementations can
456 : disagree on why a program faulted (e.g. SIGCOST versus SIGSEGV in the
457 : example above), they cannot disagree on whether or not a program
458 : faulted. As a result, the specific fault reason must never be
459 : allowed to be part of consensus.
460 :
461 : fd_vm_exec_trace runs with tracing and requires vm to be attached to
462 : a trace. fd_vm_exec_notrace runs without tracing even if vm
463 : is attached to a trace. */
464 :
465 : int
466 : fd_vm_exec_trace( fd_vm_t * vm );
467 :
468 : int
469 : fd_vm_exec_notrace( fd_vm_t * vm );
470 :
471 : static inline int
472 2886 : fd_vm_exec( fd_vm_t * vm ) {
473 2886 : if( FD_UNLIKELY( vm->trace ) ) return fd_vm_exec_trace ( vm );
474 2886 : else return fd_vm_exec_notrace( vm );
475 2886 : }
476 :
477 : FD_PROTOTYPES_END
478 :
479 : #endif /* HEADER_fd_src_flamenco_vm_fd_vm_h */
|