Line data Source code
1 : #define _GNU_SOURCE
2 : #include <errno.h>
3 : #include <netinet/in.h>
4 : #include <sys/socket.h>
5 : #include <sys/stat.h>
6 : #include <sys/uio.h>
7 : #include "fd_udpsock.h"
8 : #include "../../util/net/fd_eth.h"
9 : #include "../../util/net/fd_ip4.h"
10 : #include "../../util/net/fd_udp.h"
11 :
/* FD_UDPSOCK_FRAME_ALIGN is the byte alignment of each packet frame.
   FD_UDPSOCK_HEADROOM is the number of bytes kept free at the start of
   each frame for the mock Ethernet, IPv4 and UDP headers. */

#define FD_UDPSOCK_FRAME_ALIGN (16UL)
#define FD_UDPSOCK_HEADROOM    ( 14UL /* Ethernet */ + 20UL /* IPv4 */ + 8UL /* UDP */ )
16 :
17 : struct fd_udpsock {
18 : fd_aio_t aio_self; /* aio provided by udpsock */
19 : fd_aio_t const * aio_rx; /* aio provided by receiver */
20 :
21 : int fd; /* file descriptor of actual socket */
22 :
23 : /* Mock Ethernet fields */
24 :
25 : uchar eth_self_addr[ 6 ];
26 : uchar eth_peer_addr[ 6 ];
27 :
28 : /* Mock UDP/IPv4 fields */
29 :
30 : uint ip_self_addr; /* network byte order */
31 : ushort udp_self_port; /* little endian */
32 :
33 : /* Pointers to variable length data structures */
34 :
35 : ulong rx_cnt;
36 : struct mmsghdr * rx_msg;
37 : struct iovec * rx_iov;
38 : void * rx_frame;
39 : fd_aio_pkt_info_t * rx_pkt;
40 : ulong tx_cnt;
41 : struct mmsghdr * tx_msg;
42 : struct iovec * tx_iov;
43 : void * tx_frame;
44 :
45 : /* Variable length data structures follow ...
46 :
47 : struct mmsghdr [ rx_cnt ] (rx)
48 : struct mmsghdr [ tx_cnt ] (tx)
49 : struct iovec [ rx_cnt ] (rx)
50 : struct iovec [ tx_cnt ] (tx)
51 : uchar [ mtu ][ rx_cnt ] (rx)
52 : fd_aio_pkt_t [ rx_cnt ] (rx)
53 : struct sockaddr_in[ rx_cnt ] (rx)
54 : struct sockaddr_in[ tx_cnt ] (tx) */
55 : };
56 :
57 : /* Forward declaration */
58 : static int
59 : fd_udpsock_send( void * ctx,
60 : fd_aio_pkt_info_t const * batch,
61 : ulong batch_cnt,
62 : ulong * opt_batch_idx,
63 : int flush );
64 :
65 : FD_FN_CONST ulong
66 0 : fd_udpsock_align( void ) {
67 0 : return alignof(fd_udpsock_t);
68 0 : }
69 :
70 : FD_FN_CONST ulong
71 : fd_udpsock_footprint( ulong mtu,
72 : ulong rx_pkt_cnt,
73 0 : ulong tx_pkt_cnt ) {
74 :
75 0 : if( FD_UNLIKELY( ( mtu ==0UL )
76 0 : | ( mtu <=FD_UDPSOCK_HEADROOM )
77 0 : | ( rx_pkt_cnt==0UL )
78 0 : | ( tx_pkt_cnt==0UL ) ) )
79 0 : return 0UL;
80 :
81 0 : ulong tot_pkt_cnt = rx_pkt_cnt + tx_pkt_cnt;
82 0 : ulong aligned_mtu = fd_ulong_align_up( mtu, FD_UDPSOCK_FRAME_ALIGN );
83 :
84 0 : return
85 0 : FD_LAYOUT_FINI ( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND(
86 0 : FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND(
87 0 : FD_LAYOUT_APPEND( FD_LAYOUT_INIT,
88 0 : alignof( fd_udpsock_t ), sizeof( fd_udpsock_t ) ),
89 0 : alignof( struct mmsghdr ), tot_pkt_cnt*sizeof( struct mmsghdr ) ),
90 0 : alignof( struct iovec ), tot_pkt_cnt*sizeof( struct iovec ) ),
91 0 : FD_UDPSOCK_FRAME_ALIGN, rx_pkt_cnt *aligned_mtu ),
92 0 : alignof( fd_aio_pkt_info_t ), rx_pkt_cnt *sizeof( fd_aio_pkt_info_t ) ),
93 0 : alignof( struct sockaddr_in ), tot_pkt_cnt*sizeof( struct sockaddr_in ) ),
94 0 : FD_UDPSOCK_ALIGN );
95 0 : }
96 :
97 : void *
98 : fd_udpsock_new( void * shmem,
99 : ulong mtu,
100 : ulong rx_pkt_cnt,
101 0 : ulong tx_pkt_cnt ) {
102 :
103 0 : if( FD_UNLIKELY( !shmem ) ) {
104 0 : FD_LOG_WARNING(( "NULL shmem" ));
105 0 : return NULL;
106 0 : }
107 :
108 0 : ulong laddr = (ulong)shmem;
109 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( laddr, fd_udpsock_align() ) ) ) {
110 0 : FD_LOG_WARNING(( "misaligned shmem" ));
111 0 : return NULL;
112 0 : }
113 0 : ulong footprint = fd_udpsock_footprint( mtu, rx_pkt_cnt, tx_pkt_cnt );
114 0 : if( FD_UNLIKELY( !footprint ) ) {
115 0 : FD_LOG_WARNING(( "invalid footprint for config" ));
116 0 : return NULL;
117 0 : }
118 0 : laddr += FD_LAYOUT_INIT;
119 :
120 : /* Allocate main struct */
121 :
122 0 : laddr = fd_ulong_align_up( laddr, alignof(fd_udpsock_t) );
123 0 : fd_udpsock_t * sock = (fd_udpsock_t *)laddr;
124 0 : memset( sock, 0, sizeof(fd_udpsock_t) );
125 0 : sock->fd = -1;
126 0 : sock->rx_cnt = rx_pkt_cnt;
127 0 : sock->tx_cnt = tx_pkt_cnt;
128 0 : laddr += sizeof(fd_udpsock_t);
129 :
130 0 : ulong tot_pkt_cnt = rx_pkt_cnt + tx_pkt_cnt;
131 0 : ulong aligned_mtu = fd_ulong_align_up( mtu, FD_UDPSOCK_FRAME_ALIGN );
132 :
133 : /* Set defaults for mock network headers */
134 :
135 0 : memcpy( sock->eth_self_addr, (uchar[6]){0x00, 0x00, 0x5e, 0x00, 0x53, 0x42}, 6 );
136 0 : memcpy( sock->eth_peer_addr, (uchar[6]){0x00, 0x00, 0x5e, 0x00, 0x53, 0x43}, 6 );
137 :
138 0 : sock->ip_self_addr = FD_IP4_ADDR( 0, 0, 0, 0 );
139 0 : sock->udp_self_port = 0;
140 :
141 0 : sock->aio_self = (fd_aio_t){
142 0 : .ctx = sock,
143 0 : .send_func = fd_udpsock_send
144 0 : };
145 :
146 : /* Allocate variable-length data structures */
147 :
148 0 : laddr = fd_ulong_align_up( laddr, alignof(struct mmsghdr) );
149 0 : struct mmsghdr * msg = (struct mmsghdr *)laddr;
150 0 : sock->rx_msg = msg;
151 0 : sock->tx_msg = msg + rx_pkt_cnt;
152 0 : laddr += tot_pkt_cnt*sizeof(struct mmsghdr);
153 :
154 0 : laddr = fd_ulong_align_up( laddr, alignof(struct iovec) );
155 0 : struct iovec * iov = (struct iovec *)laddr;
156 0 : sock->rx_iov = iov;
157 0 : sock->tx_iov = iov + rx_pkt_cnt;
158 0 : laddr += tot_pkt_cnt*sizeof(struct iovec);
159 :
160 0 : laddr = fd_ulong_align_up( laddr, FD_UDPSOCK_FRAME_ALIGN );
161 0 : ulong frame_base = laddr;
162 0 : sock->rx_frame = (void *)laddr;
163 0 : sock->tx_frame = (void *)(laddr + aligned_mtu*rx_pkt_cnt);
164 0 : laddr += rx_pkt_cnt*aligned_mtu;
165 :
166 0 : laddr = fd_ulong_align_up( laddr, alignof(fd_aio_pkt_info_t) );
167 0 : fd_aio_pkt_info_t * pkt = (fd_aio_pkt_info_t *)laddr;
168 0 : sock->rx_pkt = pkt;
169 0 : laddr += rx_pkt_cnt*sizeof(fd_aio_pkt_info_t);
170 :
171 0 : laddr = fd_ulong_align_up( laddr, alignof(struct sockaddr_in) );
172 0 : struct sockaddr_in * saddrs = (struct sockaddr_in *)laddr;
173 0 : laddr += tot_pkt_cnt*sizeof(struct sockaddr_in);
174 :
175 : /* Prepare iovec and msghdr buffers */
176 :
177 0 : for( ulong i=0; i<rx_pkt_cnt; i++ ) {
178 0 : iov[i].iov_base = (void *)(frame_base + i*aligned_mtu + FD_UDPSOCK_HEADROOM);
179 0 : iov[i].iov_len = aligned_mtu - FD_UDPSOCK_HEADROOM;
180 0 : msg[i].msg_hdr.msg_iov = &iov[i];
181 0 : msg[i].msg_hdr.msg_iovlen = 1;
182 0 : msg[i].msg_hdr.msg_name = &saddrs[i];
183 0 : msg[i].msg_hdr.msg_namelen = sizeof(struct sockaddr_in);
184 0 : }
185 0 : for( ulong i=rx_pkt_cnt; i<tot_pkt_cnt; i++ ) {
186 0 : msg[i].msg_hdr.msg_iov = &iov[i];
187 0 : msg[i].msg_hdr.msg_iovlen = 1;
188 0 : msg[i].msg_hdr.msg_name = &saddrs[i];
189 0 : msg[i].msg_hdr.msg_namelen = sizeof(struct sockaddr_in);
190 0 : }
191 :
192 0 : return shmem;
193 0 : }
194 :
195 : fd_udpsock_t *
196 : fd_udpsock_join( void * shsock,
197 0 : int fd ) {
198 :
199 0 : if( FD_UNLIKELY( !shsock ) ) {
200 0 : FD_LOG_WARNING(( "NULL shsock" ));
201 0 : return NULL;
202 0 : }
203 :
204 0 : fd_udpsock_t * sock = (fd_udpsock_t *)shsock;
205 0 : sock->fd = fd;
206 :
207 : /* Extract socket address */
208 0 : struct sockaddr addr;
209 0 : socklen_t addrlen = sizeof(addr);
210 0 : int res = getsockname( fd, &addr, &addrlen );
211 0 : if( FD_UNLIKELY( res < 0 ) ) {
212 0 : FD_LOG_WARNING(( "getsockname(%d) failed (%i-%s)", fd, errno, fd_io_strerror( errno ) ));
213 0 : return NULL;
214 0 : }
215 0 : if( FD_UNLIKELY( addr.sa_family != AF_INET ) ) {
216 0 : FD_LOG_WARNING(( "getsockname(%d) returned non-IPv4 address", fd ));
217 0 : return NULL;
218 0 : }
219 0 : struct sockaddr_in const * sin = (struct sockaddr_in const *)fd_type_pun_const( &addr );
220 0 : sock->ip_self_addr = sin->sin_addr.s_addr;
221 0 : sock->udp_self_port = fd_ushort_bswap( sin->sin_port );
222 :
223 0 : return sock;
224 0 : }
225 :
226 : void *
227 0 : fd_udpsock_leave( fd_udpsock_t * sock ) {
228 0 : if( FD_UNLIKELY( !sock ) ) {
229 0 : FD_LOG_WARNING(( "NULL sock" ));
230 0 : return NULL;
231 0 : }
232 0 : sock->fd = -1;
233 0 : return (void *)sock;
234 0 : }
235 :
/* fd_udpsock_delete hands the memory region back to the caller.  The
   region is returned untouched; a NULL argument is logged and yields
   NULL. */

void *
fd_udpsock_delete( void * shsock ) {
  if( FD_LIKELY( shsock ) ) return shsock;

  FD_LOG_WARNING(( "NULL shsock" ));
  return NULL;
}
244 :
245 : void
246 : fd_udpsock_set_rx( fd_udpsock_t * sock,
247 0 : fd_aio_t const * aio ) {
248 0 : sock->aio_rx = aio;
249 0 : }
250 :
251 : FD_FN_CONST fd_aio_t const *
252 0 : fd_udpsock_get_tx( fd_udpsock_t * sock ) {
253 0 : return &sock->aio_self;
254 0 : }
255 :
256 : void
257 0 : fd_udpsock_service( fd_udpsock_t * sock ) {
258 : /* Receive packets into iovecs */
259 :
260 0 : int fd = sock->fd;
261 0 : long res = recvmmsg( fd, sock->rx_msg, (uint)sock->rx_cnt, MSG_DONTWAIT, NULL );
262 0 : if( FD_UNLIKELY( res<0 ) ) {
263 0 : if( FD_LIKELY( (errno==EAGAIN) | (errno==EWOULDBLOCK) ) )
264 0 : return;
265 0 : FD_LOG_WARNING(( "recvmmsg(%d) failed (%i-%s)", fd, errno, fd_io_strerror( errno ) ));
266 0 : return;
267 0 : }
268 0 : ulong msg_cnt = (ulong)res;
269 :
270 : /* Create fake headers and prepare an aio batch */
271 :
272 0 : for( ulong i=0UL; i<msg_cnt; i++ ) {
273 0 : struct sockaddr_in const * addr = (struct sockaddr_in const *)sock->rx_msg[i].msg_hdr.msg_name;
274 :
275 0 : void * frame_base = (void *)( (ulong)sock->rx_iov[i].iov_base - FD_UDPSOCK_HEADROOM );
276 0 : fd_eth_hdr_t * eth = (fd_eth_hdr_t *)frame_base;
277 0 : memcpy( eth->dst, sock->eth_self_addr, 6 );
278 0 : memcpy( eth->src, sock->eth_peer_addr, 6 );
279 0 : eth->net_type = fd_ushort_bswap( FD_ETH_HDR_TYPE_IP );
280 :
281 : /* copy to avoid alignment issues */
282 0 : uchar saddr[4];
283 0 : uchar daddr[4];
284 0 : memcpy( saddr, &addr->sin_addr.s_addr, 4 );
285 0 : memcpy( daddr, &sock->ip_self_addr, 4 );
286 :
287 0 : fd_ip4_hdr_t * ip4 = (fd_ip4_hdr_t *)((ulong)eth + sizeof(fd_eth_hdr_t));
288 0 : *ip4 = (fd_ip4_hdr_t) {
289 0 : .verihl = FD_IP4_VERIHL(4,5),
290 0 : .tos = 0,
291 0 : .net_tot_len = (ushort)( (ulong)sock->rx_msg[i].msg_len
292 0 : + sizeof(fd_ip4_hdr_t)
293 0 : + sizeof(fd_udp_hdr_t) ),
294 0 : .net_id = 0,
295 0 : .net_frag_off = 0,
296 0 : .ttl = 64,
297 0 : .protocol = FD_IP4_HDR_PROTOCOL_UDP,
298 0 : .check = 0,
299 0 : .saddr_c = { saddr[0], saddr[1], saddr[2], saddr[3] },
300 0 : .daddr_c = { daddr[0], daddr[1], daddr[2], daddr[3] }
301 0 : };
302 :
303 0 : fd_ip4_hdr_bswap( ip4 ); /* convert to "network" byte order */
304 0 : ip4->check = fd_ip4_hdr_check_fast( ip4 );
305 :
306 : /* Create UDP header with network byte order */
307 0 : fd_udp_hdr_t * udp = (fd_udp_hdr_t *)((ulong)ip4 + sizeof(fd_ip4_hdr_t));
308 0 : *udp = (fd_udp_hdr_t) {
309 0 : .net_sport = (ushort)addr->sin_port,
310 0 : .net_dport = (ushort)fd_ushort_bswap( sock->udp_self_port ),
311 0 : .net_len = (ushort)fd_ushort_bswap( (ushort)( (ulong)sock->rx_msg[i].msg_len + sizeof(fd_udp_hdr_t) ) ),
312 0 : .check = 0
313 0 : };
314 :
315 0 : sock->rx_pkt[i] = (fd_aio_pkt_info_t) {
316 0 : .buf = frame_base,
317 0 : .buf_sz = (ushort)( FD_UDPSOCK_HEADROOM + (ulong)sock->rx_msg[i].msg_len )
318 0 : };
319 0 : }
320 :
321 : /* Dispatch to recipient ignoring errors */
322 :
323 0 : fd_aio_send( sock->aio_rx, sock->rx_pkt, msg_cnt, NULL, 0 );
324 0 : }
325 :
326 : static int
327 : fd_udpsock_send( void * ctx,
328 : fd_aio_pkt_info_t const * batch,
329 : ulong batch_cnt,
330 : ulong * opt_batch_idx,
331 0 : int flush ) {
332 :
333 0 : fd_udpsock_t * sock = (fd_udpsock_t *)ctx;
334 :
335 0 : if( FD_UNLIKELY( batch_cnt == 0 ) )
336 0 : return FD_AIO_SUCCESS;
337 0 : ulong send_cnt = fd_ulong_if( batch_cnt > sock->tx_cnt, sock->tx_cnt, batch_cnt );
338 :
339 0 : ulong _dummy_batch_idx;
340 0 : opt_batch_idx = opt_batch_idx ? opt_batch_idx : &_dummy_batch_idx;
341 :
342 : /* Set up iovecs */
343 :
344 0 : ulong iov_idx = 0UL;
345 0 : for( ulong i=0UL; i<send_cnt; i++ ) {
346 0 : if( FD_LIKELY( batch[i].buf_sz >= sizeof(fd_eth_hdr_t)+sizeof(fd_ip4_hdr_t) ) ) {
347 : /* skip packets that aren't IP (like ARP) */
348 : /* TODO consider doing something with ARP probes here
349 : it's an indication that the ARP table doesn't have an ARP entry for
350 : the given IP */
351 0 : fd_eth_hdr_t * eth = (fd_eth_hdr_t *)( (ulong)batch[i].buf );
352 0 : if( FD_UNLIKELY( eth->net_type != htons( 0x0800 ) ) ) continue;
353 :
354 0 : fd_ip4_hdr_t * ip4 = (fd_ip4_hdr_t *)( (ulong)batch[i].buf + sizeof(fd_eth_hdr_t) );
355 0 : fd_ip4_hdr_bswap( ip4 ); /* convert to host byte order */
356 0 : uint daddr = 0;
357 0 : memcpy( &daddr, ip4->daddr_c, 4 );
358 0 : fd_udp_hdr_t * udp = (fd_udp_hdr_t *)( (ulong)ip4 + (ulong)FD_IP4_GET_LEN(*ip4) );
359 0 : fd_udp_hdr_bswap( udp ); /* convert to host byte order */
360 0 : ushort dport = udp->net_dport;
361 :
362 0 : void * payload = (void *)( (ulong)udp + sizeof(fd_udp_hdr_t) );
363 0 : sock->tx_iov[iov_idx].iov_base = payload;
364 0 : sock->tx_iov[iov_idx].iov_len = batch[i].buf_sz - (ulong)( (ulong)payload - (ulong)batch[i].buf );
365 0 : struct sockaddr_in * addr = (struct sockaddr_in *)sock->tx_msg[iov_idx].msg_hdr.msg_name;
366 0 : addr->sin_addr = (struct in_addr) { .s_addr = daddr };
367 0 : addr->sin_port = (ushort)fd_ushort_bswap( (ushort)dport );
368 :
369 0 : iov_idx++;
370 0 : }
371 0 : }
372 0 : int fd = sock->fd;
373 0 : long res = sendmmsg( fd, sock->tx_msg, (uint)iov_idx, flush ? 0 : MSG_DONTWAIT );
374 0 : if( FD_UNLIKELY( res<0 ) ) {
375 0 : *opt_batch_idx = 0UL;
376 0 : if( FD_LIKELY( (errno==EAGAIN) | (errno==EWOULDBLOCK) ) )
377 0 : return FD_AIO_ERR_AGAIN;
378 0 : FD_LOG_WARNING(( "sendmmsg(%d) failed (%i-%s)", fd, errno, fd_io_strerror( errno ) ));
379 0 : return FD_AIO_ERR_INVAL;
380 0 : }
381 0 : ulong sent_cnt = (ulong)res;
382 :
383 0 : if( FD_UNLIKELY( iov_idx < sent_cnt ) ) {
384 0 : *opt_batch_idx = iov_idx;
385 0 : return FD_AIO_ERR_AGAIN;
386 0 : }
387 0 : return FD_AIO_SUCCESS;
388 0 : }
389 :
390 : uint
391 0 : fd_udpsock_get_ip4_address( fd_udpsock_t const * sock ) {
392 0 : return sock->ip_self_addr;
393 0 : }
394 :
395 : uint
396 0 : fd_udpsock_get_listen_port( fd_udpsock_t const * sock ) {
397 0 : return sock->udp_self_port;
398 0 : }
|