Line data Source code
1 : #define _GNU_SOURCE
2 : #include "fd_xdp1.h"
3 :
4 : #include "fd_xdp_license.h"
5 : #include "../ebpf/fd_linux_bpf.h"
6 : #include "../ebpf/fd_ebpf_asm.h"
7 :
8 : #include <errno.h>
9 : #include <unistd.h>
10 : #include <net/if.h>
11 : #include <sys/syscall.h>
12 : #include <linux/bpf.h>
13 : #include <linux/if_link.h>
14 :
15 : /* Define some kernel uapi constants in case the user is compiling
16 : with older kernel headers. This is especially a problem on Ubuntu
17 : 20.04 which supports these functions, but doesn't have them in
18 : the default headers. */
19 :
/* Fallback for the bpf(2) command number used to create a BPF link.
   Only takes effect when no macro of this name is already defined. */
#if !defined(BPF_LINK_CREATE)
#define BPF_LINK_CREATE (28)
#endif

/* Fallback for the attach type passed when linking an XDP program.
   Only takes effect when no macro of this name is already defined. */
#if !defined(BPF_XDP)
#define BPF_XDP (37)
#endif
27 :
28 : struct __attribute__((aligned(8))) bpf_link_create {
29 : uint prog_fd;
30 : uint target_ifindex;
31 : uint attach_type;
32 : uint flags;
33 : };
34 :
35 :
36 : /*
37 :
38 : ┌─────────────────────────────────────────┐
39 : │ Packet Classification Control Flow │
40 : | |
41 : │ ┌────────┐ │
42 : │ │Ethernet│ │
43 : │ └───┬────┘ │
44 : │ │ │
45 : │ ┌───▼────┐ │
46 : │ │ IPV4 │ │
47 : │ └──┬───┬─┘ │
48 : │ │ │ │
49 : │ │ └────►──────┐ │
50 : │ │ │ GRE │ │
51 : │ │ └───┬──┘ │
52 : │ │ │ │
53 : │ │ ┌───▼────────┐ │
54 : │ │ │ Inner IPV4 │ │
55 : │ │ ├────────────┘ │
56 : │ │ │ │
57 : │ ┌───▼──◄─────┘ │
58 : │ │ UDP │ │
59 : │ └──┬───┘ │
60 : | │ |
61 : | ┌──────▼───────┐ ┌──────────────┐ |
62 : | | LBL_REDIRECT | | LBL_PASS | |
63 : | └──────────────┘ └──────────────┘ |
64 : └─────────────────────────────────────────┘
65 :
66 : fd_xdp_gen_program dynamically generates an eBPF bytecode program to
67 : classify incoming network packets in an XDP context. Each box in the above
68 : diagram represents a phase during the classification. NON-GRE packets follow
69 : the path of Ethernet-->IPV4-->UDP. GRE packets follow the path of
70 : Ethernet-->IPV4-->GRE-->Inner IPV4-->UDP. Based on the packet's protocol and
71 : specific header values, the function decides whether to redirect the packet
72 : to the firedancer net tile (LBL_REDIRECT) or pass the packet to the
73 : kernel (LBL_PASS).
74 : */
75 : ulong
76 : fd_xdp_gen_program( ulong code_buf[ 512 ],
77 : int xsks_fd,
78 : uint listen_ip4_addr,
79 : ushort const * ports,
80 : ulong ports_cnt,
81 0 : int allowed_gre ) {
82 :
83 0 : #define LBL_PASS 1 // Pass the packet to the kernel
84 0 : #define LBL_REDIRECT 2 // Redirect the packet to firedancer software
85 :
86 0 : #define LBL_GRE_CHECK 3 // Ethernet-->IPV4-->GRE-->Inner IPV4-->UDP
87 0 : #define LBL_UDP_CHECK 4 // Ethernet-->IPV4-->UDP
88 :
89 0 : if( FD_UNLIKELY( ports_cnt>16UL ) ) {
90 0 : FD_LOG_ERR(( "Too many XDP UDP ports (%lu)", ports_cnt ));
91 0 : }
92 :
93 0 : ulong * code = code_buf;
94 0 : *(code++) = FD_EBPF( ldxw, r2, r1, 0 ); // r2 = xdp_md->data
95 0 : *(code++) = FD_EBPF( ldxw, r3, r1, 4 ); // r3 = xdp_md->data_end
96 :
97 0 : *(code++) = FD_EBPF( mov64_reg, r5, r2 );
98 0 : *(code++) = FD_EBPF( add64_imm, r5, 34 ); // Bound check accessing the eth_hdr (14 bytes) and the ip4_hdr (20 bytes)
99 0 : *(code++) = FD_EBPF( jgt_reg, r5, r3, LBL_PASS ); // if r2+34 > r3 goto LBL_PASS
100 :
101 0 : *(code++) = FD_EBPF( ldxh, r5, r2, 12 );
102 0 : *(code++) = FD_EBPF( jne_imm, r5, 0x0008, LBL_PASS ); // if eth_hdr->net_type != IP4 goto LBL_PASS
103 :
104 : /* Advance r2 to the start of first ip4_hdr */
105 0 : *(code++) = FD_EBPF( add64_imm, r2, 14 );
106 :
107 : /* Calculate the start of next hdr and store in r4 */
108 0 : *(code++) = FD_EBPF( ldxb, r4, r2, 0 ); // r4 = ip4_hdr->verihl
109 0 : *(code++) = FD_EBPF( and64_imm, r4, 0x0f ); // r4 = ip4_hdr->ihl (lsb of ip4_hrd->verihl)
110 0 : *(code++) = FD_EBPF( lsh64_imm, r4, 2 ); // r4 = ip4_hdr->ihl*4 (length of ipv4 header)
111 0 : *(code++) = FD_EBPF( add64_reg, r4, r2 ); // r4 = &ip4_hdr + length of ip4_hdr = start of next hdr
112 :
113 : /* Check if the next hdr is udp or gre */
114 0 : *(code++) = FD_EBPF( ldxb, r5, r2, 9 ); // r5 = ip4_hdr->protocol
115 :
116 0 : if( allowed_gre==1 ) {
117 0 : *(code++) = FD_EBPF( jeq_imm, r5, 47, LBL_GRE_CHECK ); // if ip4_hdr->protocol == GRE goto gre_check
118 0 : }
119 :
120 0 : *(code++) = FD_EBPF( jeq_imm, r5, 17, LBL_UDP_CHECK ); // if ip4_hdr->protocol == UDP goto udp_check
121 0 : *(code++) = FD_EBPF( ja, LBL_PASS ); // goto LBL_PASS
122 :
123 :
124 : /* next hdr is gre */
125 0 : ulong * gre_check = code;
126 :
127 : /* Advance r2 to start of gre_hdr */
128 0 : *(code++) = FD_EBPF( mov64_reg, r2, r4 );
129 :
130 : /*
131 : * At this point:
132 : * r1: xdp_md
133 : * r2: start of next header (gre_hdr)
134 : * r3: xdp_md->data_end
135 : * r4: clobber
136 : * r5: clobber
137 : */
138 :
139 : /* Bound check GRE and inner ip4_hdr access */
140 0 : *(code++) = FD_EBPF( mov64_reg, r5, r2 );
141 0 : *(code++) = FD_EBPF( add64_imm, r5, 24 ); // r5 = 1 byte past inner ip4_hdr. sizeof(gre_hdr) + sizeof(ip4_hdr) = 4 + 20
142 0 : *(code++) = FD_EBPF( jgt_reg, r5, r3, LBL_PASS ); // if (end of inner ip4_hdr + 1) > r3, goto LBL_PASS
143 :
144 :
145 : /* Verify GRE fields */
146 0 : *(code++) = FD_EBPF( ldxh, r5, r2, 0 ); // r5 = gre_hdr->flags/version
147 0 : *(code++) = FD_EBPF( jne_imm, r5, 0x0000, LBL_PASS ); // if gre_hdr->flags/version != 0, goto LBL_PASS
148 0 : *(code++) = FD_EBPF( ldxh, r5, r2, 2 ); // r5 = gre_hdr->protocol
149 0 : *(code++) = FD_EBPF( jne_imm, r5, 0x0008, LBL_PASS ); // if gre_hdr->protocl != IP, goto LBL_PASS
150 :
151 :
152 : /* Advance r2 to start of inner ip4_hdr */
153 0 : *(code++) = FD_EBPF( add64_imm, r2, 4 ); // r2 = start of inner ip4_hdr
154 :
155 : /* Check inner ip4's encapsulated protocol */
156 0 : *(code++) = FD_EBPF( ldxb, r5, r2, 9 ); // r5 = inner ip4_hdr->protocol
157 0 : *(code++) = FD_EBPF( jne_imm, r5, 17, LBL_PASS ); // if r5!=UDP, goto LBL_PASS
158 :
159 : /* Calculate the start of udp_hdr and store in r4 */
160 0 : *(code++) = FD_EBPF( ldxb, r4, r2, 0 ); // r4 = inner ip4_hdr->verihl
161 0 : *(code++) = FD_EBPF( and64_imm, r4, 0x0f ); // r4 = inner ip4_hdr->ihl
162 0 : *(code++) = FD_EBPF( lsh64_imm, r4, 2 ); // r4 = ip4_hdr->ihl*4 (length of ipv4 header)
163 0 : *(code++) = FD_EBPF( add64_reg, r4, r2 ); // r4 = start of udp_hdr
164 :
165 : /*
166 : * At this point:
167 : * r1: &xdp_md
168 : * r2: start of ip4_hdr (inner ip4_hdr for gre)
169 : * r3: xdp_md->data_end
170 : * r4: start of udp_hdr
171 : * r5: clobber
172 : */
173 :
174 : /* udp check */
175 0 : ulong * udp_check = code;
176 :
177 : /* check ip4's dst port */
178 0 : if( listen_ip4_addr!=0 ) {
179 0 : *(code++) = FD_EBPF( ldxw, r5, r2, 16 );
180 0 : *(code++) = FD_EBPF( jne_imm, r5, listen_ip4_addr, LBL_PASS ); // if ip4->daddr != listen_ip4_addr goto LBL_PASS
181 0 : }
182 :
183 : /* Advance r2 to start of udp_hdr */
184 0 : *(code++) = FD_EBPF( mov64_reg, r2, r4 );
185 :
186 : /* bound check udp hdr access */
187 0 : *(code++) = FD_EBPF( add64_imm, r4, 8 ); // r4 += sizeof(udp_hdr) = 1 byte pass the end of udp_hdr
188 0 : *(code++) = FD_EBPF( jgt_reg, r4, r3, LBL_PASS ); // if (end of udp_hdr + 1) > r3 goto LBL_PASS
189 :
190 : /* get destination port from udp_hdr */
191 0 : *(code++) = FD_EBPF( ldxh, r4, r2, 2 ); // r4 = udp_hdr->dst_port
192 :
193 : /* loop through the ports array and find a match with dst_port */
194 0 : for( ulong i=0UL; i<ports_cnt; i++ ) {
195 0 : ushort port = (ushort)fd_ushort_bswap( ports[ i ] );
196 0 : if( !port ) continue;
197 0 : *(code++) = FD_EBPF( jeq_imm, r4, port, LBL_REDIRECT ); // if dst_port == ports[i] goto LBL_REDIRECT
198 0 : }
199 :
200 0 : ulong * lbl_pass = code;
201 0 : *(code++) = FD_EBPF( mov64_imm, r0, XDP_PASS );
202 0 : *(code++) = FD_EBPF_exit; // return XDP_PASS
203 0 : ulong * lbl_redirect = code;
204 0 : *(code++) = FD_EBPF( ldxw, r2, r1, 16 ); // r2 = xdp_md->rx_queue_index
205 0 : *(code++) = FD_EBPF( lddw, r1, xsks_fd ); // r1 = xsk_map_fd ll
206 0 : *(code++) = 0;
207 0 : *(code++) = FD_EBPF( mov64_imm, r3, 0 ); // r3 = 0
208 0 : *(code++) = FD_EBPF( call, 0x33 );
209 0 : *(code++) = FD_EBPF_exit; // return bpf_redirect_map(r1,r2,r3)
210 :
211 0 : ulong * code_end = code;
212 0 : ulong code_cnt = (ulong)( code_end-code_buf );
213 :
214 0 : FD_LOG_HEXDUMP_DEBUG(( "XDP program", code_buf, code_cnt*sizeof(ulong) ));
215 :
216 : /* Fill in jump labels */
217 :
218 0 : for( ulong i=0UL; i<code_cnt; i++ ) {
219 0 : if( (code_buf[ i ] & 0x05)==0x05 ) {
220 0 : ulong * jmp_target = 0;
221 0 : uint jmp_label = (code_buf[ i ]>>16) & 0xFFFF;
222 0 : switch( jmp_label ) {
223 0 : case 0: continue;
224 0 : case LBL_PASS: jmp_target = lbl_pass; break;
225 0 : case LBL_REDIRECT: jmp_target = lbl_redirect; break;
226 0 : case LBL_GRE_CHECK: jmp_target = gre_check; break;
227 0 : case LBL_UDP_CHECK: jmp_target = udp_check; break;
228 0 : default: FD_LOG_ERR(( "Invalid jump instruction (%016lx)", fd_ulong_bswap( code_buf[ i ] ) ));
229 0 : }
230 0 : long off = jmp_target-code_buf-(long)i-1;
231 0 : ushort off_u = (ushort)(short)off;
232 0 : code_buf[ i ] = (code_buf[ i ] & 0xFFFFFFFF0000FFFF) | ((ulong)off_u<<16UL);
233 0 : }
234 0 : }
235 :
236 0 : #undef LBL_PASS
237 0 : #undef LBL_REDIRECT
238 :
239 0 : #undef LBL_GRE_CHECK
240 0 : #undef LBL_UDP_CHECK
241 0 : return code_cnt;
242 0 : }
243 :
244 : fd_xdp_fds_t
245 : fd_xdp_install( uint if_idx,
246 : uint listen_ip4_addr,
247 : ulong ports_cnt,
248 : ushort const * ports,
249 0 : char const * xdp_mode ) {
250 : /* Check args */
251 :
252 0 : uint uxdp_mode = 0;
253 0 : if( !strcmp( xdp_mode, "skb" ) ) uxdp_mode = XDP_FLAGS_SKB_MODE;
254 0 : else if( !strcmp( xdp_mode, "drv" ) ) uxdp_mode = XDP_FLAGS_DRV_MODE;
255 0 : else if( !strcmp( xdp_mode, "hw" ) ) uxdp_mode = XDP_FLAGS_HW_MODE;
256 0 : else if( !strcmp( xdp_mode, "generic" ) ) uxdp_mode = 0U;
257 0 : else FD_LOG_ERR(( "unknown XDP mode `%s`", xdp_mode ));
258 :
259 0 : uint true_port_cnt = 0U;
260 0 : for( ulong i=0UL; i<ports_cnt; i++ ) true_port_cnt += !!ports[ i ];
261 0 : if( FD_UNLIKELY( !true_port_cnt ) ) FD_LOG_ERR(( "XDP program is not listening on any UDP ports" ));
262 :
263 : /* Create XSK map */
264 :
265 0 : union bpf_attr attr2 = {
266 0 : .map_type = BPF_MAP_TYPE_XSKMAP,
267 0 : .key_size = 4U,
268 0 : .value_size = 4U,
269 0 : .max_entries = 256U,
270 0 : .map_name = "fd_xdp_xsks"
271 0 : };
272 0 : int xsk_map_fd = (int)bpf( BPF_MAP_CREATE, &attr2, sizeof(union bpf_attr) );
273 0 : if( FD_UNLIKELY( -1==xsk_map_fd ) ) FD_LOG_ERR(( "Failed to create XSKMAP (%i-%s)", errno, fd_io_strerror( errno ) ));
274 :
275 : /* Load eBPF program into kernel */
276 :
277 0 : ulong code_buf[ 512 ];
278 0 : ulong code_cnt = fd_xdp_gen_program( code_buf, xsk_map_fd, listen_ip4_addr, ports, ports_cnt, 1 );
279 :
280 0 : char ebpf_kern_log[ 32768UL ];
281 0 : union bpf_attr attr = {
282 0 : .prog_type = BPF_PROG_TYPE_XDP,
283 0 : .insn_cnt = (uint)code_cnt,
284 0 : .insns = (ulong)code_buf,
285 0 : .license = (ulong)FD_LICENSE,
286 : /* Verifier logs */
287 0 : .log_level = 6,
288 0 : .log_size = 32768UL,
289 0 : .log_buf = (ulong)ebpf_kern_log
290 0 : };
291 0 : int prog_fd = (int)bpf( BPF_PROG_LOAD, &attr, sizeof(union bpf_attr) );
292 0 : if( FD_UNLIKELY( -1==prog_fd ) ) {
293 0 : FD_LOG_WARNING(( "bpf(BPF_PROG_LOAD) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
294 0 : FD_LOG_ERR(( "eBPF verifier log:\n%s", ebpf_kern_log ));
295 0 : }
296 :
297 : /* Install program to device */
298 :
299 0 : struct bpf_link_create link_create = {
300 0 : .prog_fd = (uint)prog_fd,
301 0 : .target_ifindex = if_idx,
302 0 : .attach_type = BPF_XDP,
303 0 : .flags = uxdp_mode
304 0 : };
305 :
306 0 : int prog_link_fd = (int)bpf( BPF_LINK_CREATE, fd_type_pun( &link_create ), sizeof(struct bpf_link_create) );
307 0 : if( FD_UNLIKELY( -1==prog_link_fd ) ) {
308 0 : if( FD_LIKELY( errno==ENOSYS ) ) {
309 0 : FD_LOG_ERR(( "BPF_LINK_CREATE is not supported by your kernel (%i-%s). Firedancer requires a Linux "
310 0 : "kernel version of v5.7 or newer to support fast XDP networking. Please upgrade to a newer "
311 0 : "kernel version.", errno, fd_io_strerror( errno ) ));
312 0 : } else if( FD_LIKELY( errno==EINVAL ) ) {
313 0 : char if_name[ IF_NAMESIZE ] = {0};
314 0 : FD_LOG_ERR(( "BPF_LINK_CREATE failed on interface %s (%i-%s). This likely means the network device "
315 0 : "does not have support for XDP. If the device is a bonding device, you will need "
316 0 : "a kernel version of v5.15 or newer. For other devices, see the list of kernel "
317 0 : "support at https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md#xdp",
318 0 : if_indextoname( if_idx, if_name ), errno, fd_io_strerror( errno ) ));
319 0 : } else {
320 0 : FD_LOG_ERR(( "BPF_LINK_CREATE failed (%i-%s)", errno, fd_io_strerror( errno ) ));
321 0 : }
322 0 : }
323 :
324 0 : if( FD_UNLIKELY( -1==close( prog_fd ) ) ) FD_LOG_ERR(( "close(%d) failed (%i-%s)", xsk_map_fd, errno, fd_io_strerror( errno ) ));
325 :
326 0 : return (fd_xdp_fds_t){
327 0 : .xsk_map_fd = xsk_map_fd,
328 0 : .prog_link_fd = prog_link_fd,
329 0 : };
330 0 : }
|