Line data Source code
1 : #define _GNU_SOURCE
2 : #include "fd_xdp1.h"
3 :
4 : #include "fd_xdp_license.h"
5 : #include "../ebpf/fd_linux_bpf.h"
6 : #include "../ebpf/fd_ebpf_asm.h"
7 :
8 : #include <errno.h>
9 : #include <unistd.h>
10 : #include <net/if.h>
11 : #include <sys/syscall.h>
12 : #include <linux/bpf.h>
13 : #include <linux/if_link.h>
14 :
15 : /* Define some kernel uapi constants in case the user is compiling
16 : with older kernel headers. This is especially a problem on Ubuntu
17 : 20.04 which supports these functions, but doesn't have them in
18 : the default headers. */
19 :
20 : #ifndef BPF_LINK_CREATE
21 0 : #define BPF_LINK_CREATE (28)
22 : #endif
23 :
24 : #ifndef BPF_XDP
25 0 : #define BPF_XDP (37)
26 : #endif
27 :
28 : struct __attribute__((aligned(8))) bpf_link_create {
29 : uint prog_fd;
30 : uint target_ifindex;
31 : uint attach_type;
32 : uint flags;
33 : };
34 :
35 :
36 : /*
37 :
38 : ┌─────────────────────────────────────────┐
39 : │ Packet Classification Control Flow │
40 : | |
41 : │ ┌────────┐ │
42 : │ │Ethernet│ │
43 : │ └───┬────┘ │
44 : │ │ │
45 : │ ┌───▼────┐ │
46 : │ │ IPV4 │ │
47 : │ └──┬───┬─┘ │
48 : │ │ │ │
49 : │ │ └────►──────┐ │
50 : │ │ │ GRE │ │
51 : │ │ └───┬──┘ │
52 : │ │ │ │
53 : │ │ ┌───▼────────┐ │
54 : │ │ │ Inner IPV4 │ │
55 : │ │ ├────────────┘ │
56 : │ │ │ │
57 : │ ┌───▼──◄─────┘ │
58 : │ │ UDP │ │
59 : │ └──┬───┘ │
60 : | │ |
61 : | ┌──────▼───────┐ ┌──────────────┐ |
62 : | | LBL_REDIRECT | | LBL_PASS | |
63 : | └──────────────┘ └──────────────┘ |
64 : └─────────────────────────────────────────┘
65 :
   fd_xdp_gen_program dynamically generates an eBPF bytecode program that
   classifies incoming network packets in an XDP context.  Each box in the
   above diagram represents one phase of the classification.  Non-GRE packets
   follow the path Ethernet-->IPV4-->UDP.  GRE packets follow the path
   Ethernet-->IPV4-->GRE-->Inner IPV4-->UDP.  Based on the packet's protocol
   and specific header values, the generated program decides whether to
   redirect the packet to the firedancer net tile (LBL_REDIRECT) or pass the
   packet to the kernel (LBL_PASS).
74 : */
75 : ulong
76 : fd_xdp_gen_program( ulong code_buf[ 512 ],
77 : int xsks_fd,
78 : uint listen_ip4_addr,
79 : ushort const * ports,
80 : ulong ports_cnt,
81 0 : int allowed_gre ) {
82 :
83 0 : #define LBL_PASS 1 // Pass the packet to the kernel
84 0 : #define LBL_REDIRECT 2 // Redirect the packet to firedancer software
85 :
86 0 : #define LBL_GRE_CHECK 3 // Ethernet-->IPV4-->GRE-->Inner IPV4-->UDP
87 0 : #define LBL_UDP_CHECK 4 // Ethernet-->IPV4-->UDP
88 :
89 0 : if( FD_UNLIKELY( ports_cnt>16UL ) ) {
90 0 : FD_LOG_ERR(( "Too many XDP UDP ports (%lu)", ports_cnt ));
91 0 : }
92 :
93 0 : ulong * code = code_buf;
94 0 : *(code++) = FD_EBPF( ldxw, r2, r1, 0 ); // r2 = xdp_md->data
95 0 : *(code++) = FD_EBPF( ldxw, r3, r1, 4 ); // r3 = xdp_md->data_end
96 :
97 0 : *(code++) = FD_EBPF( mov64_reg, r5, r2 );
98 0 : *(code++) = FD_EBPF( add64_imm, r5, 34 ); // Bound check accessing the eth_hdr (14 bytes) and the ip4_hdr (20 bytes)
99 0 : *(code++) = FD_EBPF( jgt_reg, r5, r3, LBL_PASS ); // if r2+34 > r3 goto LBL_PASS
100 :
101 0 : *(code++) = FD_EBPF( ldxh, r5, r2, 12 );
102 0 : *(code++) = FD_EBPF( jne_imm, r5, 0x0008, LBL_PASS ); // if eth_hdr->net_type != IP4 goto LBL_PASS
103 :
104 : /* Advance r2 to the start of first ip4_hdr */
105 0 : *(code++) = FD_EBPF( add64_imm, r2, 14 );
106 :
107 : /* Calculate the start of next hdr and store in r4 */
108 0 : *(code++) = FD_EBPF( ldxb, r4, r2, 0 ); // r4 = ip4_hdr->verihl
109 0 : *(code++) = FD_EBPF( and64_imm, r4, 0x0f ); // r4 = ip4_hdr->ihl (lsb of ip4_hrd->verihl)
110 0 : *(code++) = FD_EBPF( lsh64_imm, r4, 2 ); // r4 = ip4_hdr->ihl*4 (length of ipv4 header)
111 0 : *(code++) = FD_EBPF( jlt_imm, r4, 20, LBL_PASS ); // if r4<20 goto LBL_PASS
112 0 : *(code++) = FD_EBPF( add64_reg, r4, r2 ); // r4 = &ip4_hdr + length of ip4_hdr = start of next hdr
113 :
114 : /* Check if the next hdr is udp or gre */
115 0 : *(code++) = FD_EBPF( ldxb, r5, r2, 9 ); // r5 = ip4_hdr->protocol
116 :
117 0 : if( allowed_gre==1 ) {
118 0 : *(code++) = FD_EBPF( jeq_imm, r5, 47, LBL_GRE_CHECK ); // if ip4_hdr->protocol == GRE goto gre_check
119 0 : }
120 :
121 0 : *(code++) = FD_EBPF( jeq_imm, r5, 17, LBL_UDP_CHECK ); // if ip4_hdr->protocol == UDP goto udp_check
122 0 : *(code++) = FD_EBPF( ja, LBL_PASS ); // goto LBL_PASS
123 :
124 :
125 : /* next hdr is gre */
126 0 : ulong * gre_check = code;
127 :
128 : /* Advance r2 to start of gre_hdr */
129 0 : *(code++) = FD_EBPF( mov64_reg, r2, r4 );
130 :
131 : /*
132 : * At this point:
133 : * r1: xdp_md
134 : * r2: start of next header (gre_hdr)
135 : * r3: xdp_md->data_end
136 : * r4: clobber
137 : * r5: clobber
138 : */
139 :
140 : /* Bound check GRE and inner ip4_hdr access */
141 0 : *(code++) = FD_EBPF( mov64_reg, r5, r2 );
142 0 : *(code++) = FD_EBPF( add64_imm, r5, 24 ); // r5 = 1 byte past inner ip4_hdr. sizeof(gre_hdr) + sizeof(ip4_hdr) = 4 + 20
143 0 : *(code++) = FD_EBPF( jgt_reg, r5, r3, LBL_PASS ); // if (end of inner ip4_hdr + 1) > r3, goto LBL_PASS
144 :
145 :
146 : /* Verify GRE fields */
147 0 : *(code++) = FD_EBPF( ldxh, r5, r2, 0 ); // r5 = gre_hdr->flags/version
148 0 : *(code++) = FD_EBPF( jne_imm, r5, 0x0000, LBL_PASS ); // if gre_hdr->flags/version != 0, goto LBL_PASS
149 0 : *(code++) = FD_EBPF( ldxh, r5, r2, 2 ); // r5 = gre_hdr->protocol
150 0 : *(code++) = FD_EBPF( jne_imm, r5, 0x0008, LBL_PASS ); // if gre_hdr->protocol != IP, goto LBL_PASS
151 :
152 :
153 : /* Advance r2 to start of inner ip4_hdr */
154 0 : *(code++) = FD_EBPF( add64_imm, r2, 4 ); // r2 = start of inner ip4_hdr
155 :
156 : /* Check inner ip4's encapsulated protocol */
157 0 : *(code++) = FD_EBPF( ldxb, r5, r2, 9 ); // r5 = inner ip4_hdr->protocol
158 0 : *(code++) = FD_EBPF( jne_imm, r5, 17, LBL_PASS ); // if r5!=UDP, goto LBL_PASS
159 :
160 : /* Calculate the start of udp_hdr and store in r4 */
161 0 : *(code++) = FD_EBPF( ldxb, r4, r2, 0 ); // r4 = inner ip4_hdr->verihl
162 0 : *(code++) = FD_EBPF( and64_imm, r4, 0x0f ); // r4 = inner ip4_hdr->ihl
163 0 : *(code++) = FD_EBPF( lsh64_imm, r4, 2 ); // r4 = ip4_hdr->ihl*4 (length of ipv4 header)
164 0 : *(code++) = FD_EBPF( jlt_imm, r4, 20, LBL_PASS ); // if r4<20 goto LBL_PASS
165 0 : *(code++) = FD_EBPF( add64_reg, r4, r2 ); // r4 = start of udp_hdr
166 :
167 : /*
168 : * At this point:
169 : * r1: &xdp_md
170 : * r2: start of ip4_hdr (inner ip4_hdr for gre)
171 : * r3: xdp_md->data_end
172 : * r4: start of udp_hdr
173 : * r5: clobber
174 : */
175 :
176 : /* udp check */
177 0 : ulong * udp_check = code;
178 :
179 : /* check ip4's dst addr */
180 0 : if( listen_ip4_addr!=0 ) {
181 0 : *(code++) = FD_EBPF( ldxw, r5, r2, 16 );
182 0 : *(code++) = FD_EBPF( jne_imm, r5, listen_ip4_addr, LBL_PASS ); // if ip4->daddr != listen_ip4_addr goto LBL_PASS
183 0 : }
184 :
185 : /* Advance r2 to start of udp_hdr */
186 0 : *(code++) = FD_EBPF( mov64_reg, r2, r4 );
187 :
188 : /* bound check udp hdr access */
189 0 : *(code++) = FD_EBPF( add64_imm, r4, 8 ); // r4 += sizeof(udp_hdr) = 1 byte pass the end of udp_hdr
190 0 : *(code++) = FD_EBPF( jgt_reg, r4, r3, LBL_PASS ); // if (end of udp_hdr + 1) > r3 goto LBL_PASS
191 :
192 : /* get destination port from udp_hdr */
193 0 : *(code++) = FD_EBPF( ldxh, r4, r2, 2 ); // r4 = udp_hdr->dst_port
194 :
195 : /* loop through the ports array and find a match with dst_port */
196 0 : for( ulong i=0UL; i<ports_cnt; i++ ) {
197 0 : ushort port = (ushort)fd_ushort_bswap( ports[ i ] );
198 0 : if( !port ) continue;
199 0 : *(code++) = FD_EBPF( jeq_imm, r4, port, LBL_REDIRECT ); // if dst_port == ports[i] goto LBL_REDIRECT
200 0 : }
201 :
202 0 : ulong * lbl_pass = code;
203 0 : *(code++) = FD_EBPF( mov64_imm, r0, XDP_PASS );
204 0 : *(code++) = FD_EBPF_exit; // return XDP_PASS
205 0 : ulong * lbl_redirect = code;
206 0 : *(code++) = FD_EBPF( ldxw, r2, r1, 16 ); // r2 = xdp_md->rx_queue_index
207 0 : *(code++) = FD_EBPF( lddw, r1, xsks_fd ); // r1 = xsk_map_fd ll
208 0 : *(code++) = 0;
209 0 : *(code++) = FD_EBPF( mov64_imm, r3, 0 ); // r3 = 0
210 0 : *(code++) = FD_EBPF( call, 0x33 );
211 0 : *(code++) = FD_EBPF_exit; // return bpf_redirect_map(r1,r2,r3)
212 :
213 0 : ulong * code_end = code;
214 0 : ulong code_cnt = (ulong)( code_end-code_buf );
215 :
216 0 : FD_LOG_HEXDUMP_DEBUG(( "XDP program", code_buf, code_cnt*sizeof(ulong) ));
217 :
218 : /* Fill in jump labels */
219 :
220 0 : for( ulong i=0UL; i<code_cnt; i++ ) {
221 0 : if( (code_buf[ i ] & 0x07)==0x05 ) {
222 0 : ulong * jmp_target = 0;
223 0 : uint jmp_label = (code_buf[ i ]>>16) & 0xFFFF;
224 0 : switch( jmp_label ) {
225 0 : case 0: continue;
226 0 : case LBL_PASS: jmp_target = lbl_pass; break;
227 0 : case LBL_REDIRECT: jmp_target = lbl_redirect; break;
228 0 : case LBL_GRE_CHECK: jmp_target = gre_check; break;
229 0 : case LBL_UDP_CHECK: jmp_target = udp_check; break;
230 0 : default: FD_LOG_ERR(( "Invalid jump instruction (%016lx)", fd_ulong_bswap( code_buf[ i ] ) ));
231 0 : }
232 0 : long off = jmp_target-code_buf-(long)i-1;
233 0 : ushort off_u = (ushort)(short)off;
234 0 : code_buf[ i ] = (code_buf[ i ] & 0xFFFFFFFF0000FFFF) | ((ulong)off_u<<16UL);
235 0 : }
236 0 : }
237 :
238 0 : #undef LBL_PASS
239 0 : #undef LBL_REDIRECT
240 :
241 0 : #undef LBL_GRE_CHECK
242 0 : #undef LBL_UDP_CHECK
243 0 : return code_cnt;
244 0 : }
245 :
246 : fd_xdp_fds_t
247 : fd_xdp_install( uint if_idx,
248 : uint listen_ip4_addr,
249 : ulong ports_cnt,
250 : ushort const * ports,
251 0 : char const * xdp_mode ) {
252 : /* Check args */
253 :
254 0 : uint uxdp_mode = 0;
255 0 : if( !strcmp( xdp_mode, "skb" ) ) uxdp_mode = XDP_FLAGS_SKB_MODE;
256 0 : else if( !strcmp( xdp_mode, "drv" ) ) uxdp_mode = XDP_FLAGS_DRV_MODE;
257 0 : else if( !strcmp( xdp_mode, "hw" ) ) uxdp_mode = XDP_FLAGS_HW_MODE;
258 0 : else if( !strcmp( xdp_mode, "default" ) ) uxdp_mode = 0U;
259 0 : else FD_LOG_ERR(( "unknown XDP mode `%s`", xdp_mode ));
260 :
261 0 : uint true_port_cnt = 0U;
262 0 : for( ulong i=0UL; i<ports_cnt; i++ ) true_port_cnt += !!ports[ i ];
263 0 : if( FD_UNLIKELY( !true_port_cnt ) ) FD_LOG_ERR(( "XDP program is not listening on any UDP ports" ));
264 :
265 : /* Create XSK map */
266 :
267 0 : union bpf_attr attr2 = {
268 0 : .map_type = BPF_MAP_TYPE_XSKMAP,
269 0 : .key_size = 4U,
270 0 : .value_size = 4U,
271 0 : .max_entries = 256U,
272 0 : .map_name = "fd_xdp_xsks"
273 0 : };
274 0 : int xsk_map_fd = (int)bpf( BPF_MAP_CREATE, &attr2, sizeof(union bpf_attr) );
275 0 : if( FD_UNLIKELY( -1==xsk_map_fd ) ) FD_LOG_ERR(( "Failed to create XSKMAP (%i-%s)", errno, fd_io_strerror( errno ) ));
276 :
277 : /* Load eBPF program into kernel */
278 :
279 0 : ulong code_buf[ 512 ];
280 0 : ulong code_cnt = fd_xdp_gen_program( code_buf, xsk_map_fd, listen_ip4_addr, ports, ports_cnt, 1 );
281 :
282 0 : char ebpf_kern_log[ 32768UL ];
283 :
284 : /* Work around a compiler bug: Clang+ASan fails to zero-initialize the
285 : entire struct if we use union assignment syntax. (It memsets 148
286 : bytes instead of 152, leaving 4 trailing bytes uninitialized, which
287 : fails in BPF_PROG_LOAD) */
288 0 : union bpf_attr attr = {0};
289 0 : attr.prog_type = BPF_PROG_TYPE_XDP;
290 0 : attr.insn_cnt = (uint)code_cnt;
291 0 : attr.insns = (ulong)code_buf;
292 0 : attr.license = (ulong)FD_LICENSE;
293 0 : attr.log_level = 6;
294 0 : attr.log_size = 32768UL;
295 0 : attr.log_buf = (ulong)ebpf_kern_log;
296 :
297 0 : int prog_fd = (int)bpf( BPF_PROG_LOAD, &attr, sizeof(union bpf_attr) );
298 0 : if( FD_UNLIKELY( -1==prog_fd ) ) {
299 0 : FD_LOG_WARNING(( "bpf(BPF_PROG_LOAD) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
300 0 : FD_LOG_ERR(( "eBPF verifier log:\n%s", ebpf_kern_log ));
301 0 : }
302 :
303 : /* Install program to device */
304 :
305 0 : struct bpf_link_create link_create = {
306 0 : .prog_fd = (uint)prog_fd,
307 0 : .target_ifindex = if_idx,
308 0 : .attach_type = BPF_XDP,
309 0 : .flags = uxdp_mode
310 0 : };
311 :
312 0 : int prog_link_fd = (int)bpf( BPF_LINK_CREATE, fd_type_pun( &link_create ), sizeof(struct bpf_link_create) );
313 0 : if( FD_UNLIKELY( -1==prog_link_fd ) ) {
314 0 : if( FD_LIKELY( errno==ENOSYS ) ) {
315 0 : FD_LOG_ERR(( "BPF_LINK_CREATE is not supported by your kernel (%i-%s). Firedancer requires a Linux "
316 0 : "kernel version of v5.7 or newer to support fast XDP networking. Please upgrade to a newer "
317 0 : "kernel version.", errno, fd_io_strerror( errno ) ));
318 0 : } else if( FD_LIKELY( errno==EINVAL ) ) {
319 0 : char if_name[ IF_NAMESIZE ] = {0};
320 0 : FD_LOG_ERR(( "BPF_LINK_CREATE failed on interface %s (%i-%s). This likely means the network device "
321 0 : "does not have support for XDP. If the device is a bonding device, you will need "
322 0 : "a kernel version of v5.15 or newer. For other devices, see the list of kernel "
323 0 : "support at https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md#xdp",
324 0 : if_indextoname( if_idx, if_name ), errno, fd_io_strerror( errno ) ));
325 0 : } else {
326 0 : FD_LOG_ERR(( "BPF_LINK_CREATE(if_idx=%u) failed (%i-%s)", if_idx, errno, fd_io_strerror( errno ) ));
327 0 : }
328 0 : }
329 :
330 0 : if( FD_UNLIKELY( -1==close( prog_fd ) ) ) FD_LOG_ERR(( "close(%d) failed (%i-%s)", prog_fd, errno, fd_io_strerror( errno ) ));
331 :
332 0 : return (fd_xdp_fds_t){
333 0 : .xsk_map_fd = xsk_map_fd,
334 0 : .prog_link_fd = prog_link_fd,
335 0 : };
336 0 : }
|