Line data Source code
1 : #ifndef HEADER_fd_src_util_cstr_fd_cstr_h
2 : #define HEADER_fd_src_util_cstr_fd_cstr_h
3 :
4 : /* APIs for manipulating '\0'-terminated character strings ("cstr") */
5 :
6 : #include "../bits/fd_bits.h"
7 :
8 : FD_PROTOTYPES_BEGIN
9 :
10 : /* cstr input *********************************************************/
11 :
/* fd_cstr_to_T converts the cstr pointed at by s into a T and returns
   its value.  Caller promises s is non-NULL and points at a cstr.

   Note fd_cstr_to_cstr just returns s.  As such the lifetime of the
   returned pointer is the lifetime of s and the ownership model of the
   underlying s is defined by the application.

   fd_cstr_to_char just returns the first character of the cstr (if cstr
   is the empty string, this will be the '\0' character ... otherwise,
   it will be a normal string character).  As chars do not have a
   consistent interpretation between platforms due to issues with the
   language standard itself, the value here should just be treated as a
   character and not an integer.  Use fd_cstr_to_schar/fd_cstr_to_uchar
   if you need to treat a char as an integer.

   fd_cstr_to_cstr and fd_cstr_to_char exist primarily for type system
   completeness / to facilitate various generic programming practices.

   The integer converters work in the strtol sense with base 0 (and thus
   ignore leading whitespace, handle leading signs and assume octal if
   the body is prefixed with 0, hexadecimal if prefixed with 0x and
   decimal otherwise). */
34 :
35 : FD_FN_CONST char const * fd_cstr_to_cstr ( char const * s );
36 : FD_FN_PURE char fd_cstr_to_char ( char const * s );
37 : FD_FN_PURE schar fd_cstr_to_schar ( char const * s );
38 : FD_FN_PURE short fd_cstr_to_short ( char const * s );
39 : FD_FN_PURE int fd_cstr_to_int ( char const * s );
40 : FD_FN_PURE long fd_cstr_to_long ( char const * s );
41 : FD_FN_PURE uchar fd_cstr_to_uchar ( char const * s );
42 : FD_FN_PURE ushort fd_cstr_to_ushort( char const * s );
43 : FD_FN_PURE uint fd_cstr_to_uint ( char const * s );
44 : FD_FN_PURE ulong fd_cstr_to_ulong ( char const * s );
45 : FD_FN_PURE float fd_cstr_to_float ( char const * s );
46 : #if FD_HAS_DOUBLE
47 : FD_FN_PURE double fd_cstr_to_double( char const * s );
48 : #endif
49 :
/* fd_cstr_to_ulong_octal is the same as fd_cstr_to_ulong but assumes
   the cstr pointed at by s is octal.  This is mostly used when parsing
   UNIX style file permissions. */
53 :
54 : FD_FN_PURE ulong fd_cstr_to_ulong_octal( char const * s );
55 :
/* fd_cstr_to_ulong_seq populates seq (which has room for seq_max items)
   with the sequence specified by the given cstr.  Sequences are a
   comma separated list of ranges (e.g. "R0,R1,R2").  The ranges
   themselves can be individual integers (e.g. "5"), a simple range
   (e.g. "4-8", includes both endpoints, stop should be at least start)
   or a range with a skip (e.g. "1-10/3" or "1-10:3", stop should be at
   least start and stride should be positive).  Ignores internal
   whitespace.  Robust against overflow / wrapping of ranges against
   ULONG_MAX.  Items may appear multiple times and sequences can have
   an arbitrary order.  Caller promises seq is non-NULL if seq_max is
   non-zero.  Returns 0 on NULL or malformed cstr or empty sequence
   (seq contents might have been arbitrarily clobbered on a malformed
   cstr). */
69 :
ulong                                          /* Actual sequence length, if greater than seq_max returned sequence truncated. */
fd_cstr_to_ulong_seq( char const * cstr,       /* String to parse, NULL returns 0 */
                      ulong *      seq,        /* Indexed [0,seq_max), elements [0,min(actual sequence length,seq_max)) populated with
                                                  the leading portion of the seq.  Any remaining elements of seq are untouched. */
                      ulong        seq_max );  /* Maximum sequence length */
75 :
/* fd_cstr_hash hashes the cstr pointed to by key to a ulong.
   fd_cstr_hash_append updates the hash value (it will be as though
   fd_cstr_hash was called on the string concatenation of all the keys
   provided to hash / hash append in order).  Treats key==NULL the same
   as the empty string "".  Yields identical cross platform results
   regardless of how the platform treats the sign of char.  Based on one
   of the djb2 hash variants (public domain).

   FIXME: This is simple and fast and pretty good practically for string
   hashing but more robust and faster algos are probably out there. */
86 :
87 : FD_FN_PURE static inline ulong
88 : fd_cstr_hash_append( ulong hash,
89 10463066 : char const * key ) {
90 10463068 : if( FD_LIKELY( key ) ) {
91 10463068 : uchar const * p = (uchar const *)key;
92 404860295 : for(;;) {
93 404860295 : ulong c = p[0];
94 404860295 : if( FD_UNLIKELY( !c ) ) break;
95 394397221 : hash = (hash*33UL) ^ c;
96 394397221 : p++;
97 394397221 : }
98 10463068 : }
99 10463066 : return hash;
100 10463066 : }
101 :
102 291 : FD_FN_PURE static inline ulong fd_cstr_hash( char const * key ) { return fd_cstr_hash_append( 5381UL, key ); }
103 :
104 : /* fd_cstr_casecmp is equivalent to strcasecmp but doesn't require
105 : FD_HAS_HOSTED (POSIX) support. */
106 :
107 : FD_FN_PURE int
108 : fd_cstr_casecmp( char const * a,
109 : char const * b );
110 :
111 : /* fd_cstr_nlen is equivalent to strnlen but doesn't require
112 : FD_HAS_HOSTED (POSIX) support. */
113 :
114 : FD_FN_PURE ulong
115 : fd_cstr_nlen( char const * s,
116 : ulong m );
117 :
118 : /* fd_cstr_ncpy is a safe version of strncpy. d is the destination cstr
119 : and s is the source cstr. d and s should not overlap. Assumes d has
120 : space for up to m bytes. Always returns d. All bytes of d will be
121 : initialized. Further, if m is not zero, d will _always_ be properly
122 : '\0' terminated.
123 :
124 : Specifically, if m is 0 (i.e. d has zero bytes of storage), this
125 : returns d. Otherwise, if s is NULL, this will zero out all m bytes
126 : of d and return d. Otherwise, this will copy up to m-1 of the
127 : leading non-zero bytes in s into d. All remaining bytes of d (there
128 : will be at least 1) will be initialized to zero. */
129 :
130 : char *
131 : fd_cstr_ncpy( char * d,
132 : char const * s,
133 : ulong m );
134 :
135 : /* cstr output ********************************************************/
136 :
/* fd_cstr_printf printfs a cstr into the sz byte memory region pointed
   to by buf.  Always returns buf.

   If buf is non-NULL and sz is non-zero, on return, buf will point to a
   cstr such that strlen(buf)<sz.  That is, bytes [0,strlen(buf)) will
   be non-'\0', byte strlen(buf) will be '\0' and bytes
   (strlen(buf),sz) will be unchanged.  If more than sz bytes are needed
   to hold the requested cstr, the cstr will be truncated to its leading
   bytes such that strlen(buf)==sz-1.  If opt_len is non-NULL, *opt_len
   will be set to the strlen(buf) on return.

   buf==NULL and/or sz==0UL are treated as a no-op.  (If opt_len is
   non-NULL, *opt_len will be 0UL on return ... this is debatable though
   given the strlen(buf) property above.  Might be better to treat this
   case as U.B., or abort if opt_len is requested when buf==NULL and/or
   sz==0UL, or return ULONG_MAX in opt_len (-1) to indicate ill defined
   usage or ...) */
154 :
155 : char *
156 : fd_cstr_printf( char * buf,
157 : ulong sz,
158 : ulong * opt_len,
159 : char const * fmt, ... ) __attribute__((format(printf,4,5)));
160 :
161 : /* fd_cstr_printf_check is the same as fd_cstr_printf except that it
162 : returns 1 if the entire cstr, including the NUL terminating
163 : character was written to buf and 0 otherwise.
164 :
165 : If the cstr was truncated, or there was an error in the printf
166 : formatting process, 0 will be returned. Otherwise, on success, 1
167 : will be returned. If zero bytes are written to buf because the
168 : format string is empty, the return value will be 1. */
169 :
170 : int
171 : fd_cstr_printf_check( char * buf,
172 : ulong sz,
173 : ulong * opt_len,
174 : char const * fmt, ... ) __attribute__((format(printf,4,5)));
175 :
/* fd_cstr_init starts writing a cstr into buf.  Returns the location
   where the first character of the cstr should be written (which is
   buf itself). */

static inline char *
fd_cstr_init( char * buf ) {
  char * p = buf;
  return p;
}
180 :
/* fd_cstr_fini finishes writing a cstr by storing the terminating '\0'
   at p.  Assumes p is valid (non-NULL and room for the terminating
   '\0').  After this, the buf passed to fd_cstr_init will be properly
   '\0' terminated. */

static inline void
fd_cstr_fini( char * p ) {
  p[0] = '\0';
}
186 :
/* fd_cstr_append_char appends the character c to the cstr being
   written at p and returns the location for the next character.
   Assumes p is valid (non-NULL and room for at least this char and a
   final terminating '\0') and c is not '\0'. */

static inline char *
fd_cstr_append_char( char * p,
                     char   c ) {
  p[0] = c;
  return p + 1;
}
192 :
193 : /* fd_cstr_append_text appends n characters of text pointed to by t to
194 : p. Assumes p is valid (non-NULL and room for at least n characters
195 : and a final terminating '\0') and t is valid (points to n consecutive
196 : non-'\0' characters). n is zero is fine. */
197 :
198 : static inline char *
199 : fd_cstr_append_text( char * p,
200 : char const * t,
201 2400757 : ulong n ) {
202 2400757 : fd_memcpy( p, t, n );
203 2400757 : return p + n;
204 2400757 : }
205 :
206 : /* fd_cstr_append_cstr appends the cstr pointed to by s to p. Assumes p
207 : is valid (non-NULL and room for at least strlen( s ) characters and a
208 : final terminating '\0'). s==NULL is treated as a no-op. */
209 :
210 : static inline char *
211 : fd_cstr_append_cstr( char * p,
212 49215 : char const * s ) {
213 49215 : if( FD_UNLIKELY( !s ) ) return p;
214 49215 : ulong n = strlen( s );
215 49215 : fd_memcpy( p, s, n );
216 49215 : return p + n;
217 49215 : }
218 :
219 : /* fd_cstr_append_cstr_safe appends up to n chars of the cstr pointed
220 : to by to p. Assumes p is valid (non-NULL and room for at least n
221 : characters and a final terminating '\0'). s==NULL is treated as a
222 : no-op. */
223 :
224 : static inline char *
225 : fd_cstr_append_cstr_safe( char * p,
226 : char const * s,
227 9976 : ulong n ) {
228 9976 : if( FD_UNLIKELY( !s ) ) return p;
229 9976 : ulong l = fd_ulong_min( strlen( s ), n );
230 9976 : fd_memcpy( p, s, l );
231 9976 : return p + l;
232 9976 : }
233 :
234 : /* fd_cstr_append_printf appends the printf of the fmt string into p.
235 : Assumes p is valid (non-NULL and room for printf characters and a
236 : final terminating '\0'). */
237 :
238 : char *
239 : fd_cstr_append_printf( char * p,
240 : char const * fmt, ... ) __attribute__((format(printf,2,3)));
241 :
/* fd_cstr_append_ulong_as_text pretty prints the ulong into p (and
   similarly for the other unsigned integer types).  Assumes p is valid
   (non-NULL and room for at least n characters and a final terminating
   '\0') and x is small enough to pretty print to n chars (which implies
   that n is at least 1).  ws is the character to left pad the converted
   value with.  pm is the prefix character to use (e.g. '+', '-'), '\0'
   indicates no prefix.  If a prefix is requested, it will be
   immediately before the most significant converted character. */
250 :
251 : static inline char *
252 : fd_cstr_append_uint_as_text( char * p,
253 : char ws,
254 : char pm,
255 : uint x,
256 14393028 : ulong n ) {
257 14393028 : char * p0 = p;
258 14393028 : p += n;
259 14393028 : char * q = p;
260 26388392 : do { uint d = x % 10U; x /= 10U; *(--q) = (char)( d + (uint)'0' ); } while( x );
261 14393028 : if( pm ) *(--q) = pm;
262 21589547 : while( p0<q ) *(p0++) = ws;
263 14393028 : return p;
264 14393028 : }
265 :
266 : static inline char *
267 : fd_cstr_append_ulong_as_text( char * p,
268 : char ws,
269 : char pm,
270 : ulong x,
271 840 : ulong n ) {
272 840 : char * p0 = p;
273 840 : p += n;
274 840 : char * q = p;
275 7170 : do { ulong d = x % 10UL; x /= 10UL; *(--q) = (char)( d + (ulong)'0' ); } while( x );
276 840 : if( pm ) *(--q) = pm;
277 1092 : while( p0<q ) *(p0++) = ws;
278 840 : return p;
279 840 : }
280 :
281 : #if FD_HAS_INT128
282 :
283 : static inline char *
284 : fd_cstr_append_uint128_as_text( char * p,
285 : char ws,
286 : char pm,
287 : uint128 x,
288 960 : ulong n ) {
289 960 : char * p0 = p;
290 960 : p += n;
291 960 : char * q = p;
292 19200 : do { uint128 d = x % (uint128)10UL; x /= (uint128)10UL; *(--q) = (char)( d + (uint128)'0' ); } while( x );
293 960 : if( pm ) *(--q) = pm;
294 1440 : while( p0<q ) *(p0++) = ws;
295 960 : return p;
296 960 : }
297 :
298 : #endif
299 :
300 : static inline char *
301 : fd_cstr_append_uchar_as_text ( char * p,
302 : char ws,
303 : char pm,
304 : uchar x,
305 96 : ulong n ) {
306 96 : return fd_cstr_append_uint_as_text( p, ws, pm, (uint)x, n );
307 96 : }
308 :
309 : static inline char *
310 : fd_cstr_append_ushort_as_text( char * p,
311 : char ws,
312 : char pm,
313 : ushort x,
314 150 : ulong n ) {
315 150 : return fd_cstr_append_uint_as_text( p, ws, pm, (uint)x, n );
316 150 : }
317 :
318 : /* fd_cstr_append_fxp10_as_text same as the above but for the decimal
319 : fixed point value:
320 : x / 10^f
321 : Assumes p is valid (non-NULL and room for at least n characters and a
322 : final terminating '\0'), x / 10^f is not too large to fit within n
323 : characters (which implies that n is at least f+2). ws is the
324 : character to left pad the converted value with. pfx is prefix
325 : character to use (e.g. '+', '-'), '\0' indicates no prefix. If a
326 : prefix is requested, it will be immediately before the most
327 : significant converted character. */
328 :
329 : FD_FN_UNUSED static char * /* Work around -Winline */
330 : fd_cstr_append_fxp10_as_text( char * p,
331 : char ws,
332 : char pm,
333 : ulong f,
334 : ulong x,
335 2400776 : ulong n ) {
336 2400776 : char * p0 = p;
337 2400776 : p += n;
338 2400776 : char * q = p;
339 23998670 : while( f ) { ulong d = x % 10UL; x /= 10UL; *(--q) = (char)( d + (ulong)'0' ); f--; }
340 2400776 : *(--q) = '.';
341 4809753 : do { ulong d = x % 10UL; x /= 10UL; *(--q) = (char)( d + (ulong)'0' ); } while( x );
342 2400776 : if( pm ) *(--q) = pm;
343 2403378 : while( p0<q ) *(p0++) = ws;
344 2400776 : return p;
345 2400776 : }
346 :
347 : /* fd_cstr_tokenize tokenizes the cstr of the form whose first
348 : byte is pointed to by cstr:
349 :
350 : [WS][TOKEN 0][DELIM][WS][TOKEN 1][DELIM]...[WS][TOKEN N]{[DELIM][WS][NUL],[NUL]}
351 :
352 : in-place, into:
353 :
354 : [WS][TOKEN 0][NUL][WS][TOKEN 1][NUL]...[WS][TOKEN tok_cnt-1][NUL]
355 :
356 : and returns tok_cnt.
357 :
358 : Further, on return, tok[i] for i in [0,min(tok_cnt,tok_max)) where
359 : tok_cnt is the number of tokens in cstr will point to the first
360 : byte of each token. Due to the tokenization, each one of these will
361 : be properly '\0' terminated.
362 :
363 : Above, [WS] is a sequence of zero or more whitespace characters,
364 : [TOKEN *] are a sequence of zero or more non-delim and non-NUL
365 : characters and delim is assumed to be a non-NUL non-whitespace
366 : character (e.g. ',').
367 :
368 : As such:
369 : - The original cstr is clobbered by this call.
370 : - tok[*] point to a properly terminated cstr into the original cstr
371 : on return. They thus have the same lifetime issues as the original
372 : cstr.
   - If tok_cnt > tok_max, tok wasn't large enough to hold all the
     tokens found in the cstr.  Only the first tok_max are available in
     tok[*] (the entire string was still tokenized though).
376 : - Found tokens will not have any leading whitespace.
377 : - Found tokens might have internal or trailing whitespace.
378 : - Zero length tokens are possible. E.g. assuming delim==':', the cstr
379 : "a: b::d: :f" has the tokens: "a", "b", "", "d", "", "f".
380 : - If the final token is zero length, it should use an explicit
381 : delimiter. E.g. assuming delim=='|':
382 : "a|b" has tokens "a", "b"
383 : "a|b|" has tokens "a", "b"
384 : "a|b| " has tokens "a", "b"
385 : "a|b||" has tokens "a", "b", ""
386 : "a|b| |" has tokens "a", "b", ""
387 : "a|b| | " has tokens "a", "b", ""
388 : - This is also true if the final token is the initial token. E.g.
389 : assuming delim==';':
390 : "" has no tokens
391 : " " has no tokens
392 : ";" has the token ""
393 : " ;" has the token ""
394 : " ; " has the token "" */
395 :
396 : ulong
397 : fd_cstr_tokenize( char ** tok,
398 : ulong tok_max,
399 : char * cstr,
400 : char delim );
401 :
402 : /* fd_cstr_append_utf8 appends the UTF-8 encoding of a Unicode code
403 : point into p. Assumes p is valid (non-NULL and room for 1-4 chars
404 : and a final terminating '\0'). */
405 :
406 : static inline char *
407 : fd_cstr_append_utf8( char * p,
408 411 : uint rune ) {
409 411 : if( FD_LIKELY( rune<=0x7f ) ) {
410 387 : *(p++) = (char)rune;
411 387 : } else if( rune<=0x7ff ) {
412 6 : *(p++) = (char)( 0xc0 | (rune>>6) );
413 6 : *(p++) = (char)( 0x80 | ((rune )&0x3f) );
414 18 : } else if( rune<=0xffff ) {
415 12 : *(p++) = (char)( 0xe0 | (rune>>12) );
416 12 : *(p++) = (char)( 0x80 | ((rune>> 6)&0x3f) );
417 12 : *(p++) = (char)( 0x80 | ((rune )&0x3f) );
418 12 : } else if( rune<=0x10ffff ) {
419 6 : *(p++) = (char)( 0xf0 | (rune>>18) );
420 6 : *(p++) = (char)( 0x80 | ((rune>>12)&0x3f) );
421 6 : *(p++) = (char)( 0x80 | ((rune>> 6)&0x3f) );
422 6 : *(p++) = (char)( 0x80 | (rune &0x3f) );
423 6 : } else {
424 : /* replacement char */
425 0 : *(p++) = (char)0xef;
426 0 : *(p++) = (char)0xbf;
427 0 : *(p++) = (char)0xbd;
428 0 : }
429 411 : return p;
430 411 : }
431 :
432 : FD_PROTOTYPES_END
433 :
434 : /* The below macros guarantee the corresponding ctype.h functions return
435 : a value strictly in [0,1]. These still need the caller to include
436 : <ctype.h> to use.
437 :
438 : For context, the vast majority of developers reasonably expect
439 : ctype.h functions (like isspace) return 1/0 if the given character
440 : is/is not in the tested class.
441 :
442 : But the standard actually says these functions return non-zero/0.
443 :
444 : Many common standard libraries exploit this ambiguity. For example,
445 : isspace('\n') returns 8192 on recent linux-gcc-x86_64.
446 :
447 : In most usage, this subtle distinction does not make a difference.
448 :
449 : That makes it worse.
450 :
   When the distinction matters, it is incredibly difficult and time
   consuming to debug.  Consider using ctype.h functions in user input
   parsing to compute the case of a switch statement.  Suddenly,
   seemingly innocuous code run on seemingly innocuous input generates a
   branch to a mystifying place.  Hilarity ensues.
456 :
457 : Worse still, this is a massive security risk. Consider using ctype.h
458 : in a mission critical VM implementation. Strictly deterministic
459 : verifiable cross platform behavior is a necessity. Should a
460 : malicious smart contract be able to halt a chain because the standard
461 : absent mindedly gave isspace dubious flexibility on its return value?
462 :
463 : That is, this behavior is absolutely vile.
464 :
   This is an example of "implicitly specified behavior".  There is some
   behavior that must be followed bit-for-bit.  But nobody knows what
   that behavior is because the standard was written poorly.  And then
   the library implementation blindly followed the standard and ignored
   developer usability in ambiguous situations.
470 :
   This also violates the core UNIX principle (and generally good idea
   in engineering or beyond) of "be generous in what you accept and
   strict in what you produce".  Given the generally abysmal language
   handling of boolean values, the only practical way to handle
   true/false reliably and efficiently is to accept non-zero/0 but only
   produce 1/0.  (And, since both sides have some responsibility in the
   examples above, the library implementation is more at fault as it is
   more foundational, more trusted and more reused.  Library code should
   be held to a higher standard than code built on top of it.)
480 :
481 : Any use of naked ctype.h functions should be considered harmful and
482 : eradicated with extreme prejudice until the language and standard
483 : library implementations get a clue (not holding my breath).
484 :
485 : TL;DR ctype.h but sane. */
486 :
/* Each fd_isXXX(c) evaluates to an int that is exactly 1 if the
   corresponding ctype.h isXXX(c) is non-zero and exactly 0 otherwise.
   c is evaluated once.  The caller must include <ctype.h>. */

#define fd_isalnum(c)  (isalnum((c))!=0)
#define fd_isalpha(c)  (isalpha((c))!=0)
#define fd_iscntrl(c)  (iscntrl((c))!=0)
#define fd_isdigit(c)  (isdigit((c))!=0)
#define fd_isgraph(c)  (isgraph((c))!=0)
#define fd_islower(c)  (islower((c))!=0)
#define fd_isprint(c)  (isprint((c))!=0)
#define fd_ispunct(c)  (ispunct((c))!=0)
#define fd_isspace(c)  (isspace((c))!=0)
#define fd_isupper(c)  (isupper((c))!=0)
#define fd_isxdigit(c) (isxdigit((c))!=0)
#define fd_isascii(c)  (isascii((c))!=0)
#define fd_isblank(c)  (isblank((c))!=0)
500 :
501 : #endif /* HEADER_fd_src_util_cstr_fd_cstr_h */
|