LCOV - code coverage report
Current view: top level - ballet/utf8 - fd_utf8.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 111 111 100.0 %
Date: 2026-06-29 05:51:35 Functions: 7 7 100.0 %

          Line data    Source code
       1             : #include "fd_utf8.h"
       2             : 
       3             : static uchar const fd_utf8_dfa[ 256 + 9*16 ] = {
       4             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       5             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       6             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       7             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       8             :   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
       9             :   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
      10             :   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
      11             :   0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3,
      12             :   0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,
      13             :   0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1,
      14             :   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1,
      15             :   1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,
      16             :   1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1,
      17             :   1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      18             : };
      19             : 
      20             : #if FD_HAS_AVX512
      21             : #include "../../util/simd/fd_avx512.h"
      22             : 
      23             : /* AVX512 UTF-8 validator based on https://arxiv.org/pdf/2010.03090
      24             :    "Validating UTF-8 In Less Than One Instruction Per Byte"  */
      25             : 
      26             : static inline wwb_t
      27             : prev_carry( wwb_t cur,
      28  2560823951 :             wwb_t prev ) {
      29  2560823951 :   wwb_t idx = _mm512_set_epi64( 5, 4, 3, 2, 1, 0, 15, 14 );
      30  2560823951 :   return _mm512_permutex2var_epi64( cur, idx, prev );
      31  2560823951 : }
      32             : 
      33             : static inline wwb_t
      34             : prev1_byte( wwb_t cur,
      35   920335391 :             wwb_t prev ) {
      36   920335391 :   return _mm512_alignr_epi8( cur, prev_carry( cur, prev ), 15 );
      37   920335391 : }
      38             : 
      39             : static inline wwb_t
      40             : prev2_byte( wwb_t cur,
      41   820244280 :             wwb_t prev ) {
      42   820244280 :   return _mm512_alignr_epi8( cur, prev_carry( cur, prev ), 14 );
      43   820244280 : }
      44             : 
      45             : static inline wwb_t
      46             : prev3_byte( wwb_t cur,
      47   820244280 :             wwb_t prev ) {
      48   820244280 :   return _mm512_alignr_epi8( cur, prev_carry( cur, prev ), 13 );
      49   820244280 : }
      50             : 
      51             : /* Check for special-case invalid byte sequences.  These arise from
      52             :    overlong encodings, surrogates, and codepoints > U+10FFFF.
      53             : 
      54             :    After certain lead bytes, the valid range of the next byte is
      55             :    restricted:
      56             :     E0 xx: second byte must be in [A0,BF] (not [80,9F] -> overlong)
      57             :     ED xx: second byte must be in [80,9F] (not [A0,BF] -> surrogates)
      58             :     F0 xx: second byte must be in [90,BF] (not [80,8F] -> overlong)
      59             :     F4 xx: second byte must be in [80,8F] (not [90,BF] -> out of range) */
      60             : static inline wwb_t
      61             : check_special( wwb_t cur,
      62   100091111 :                wwb_t prev ) {
      63   100091111 :   wwb_t prev_hi  = wwb_shr( prev, 4 );
      64   100091111 :   wwb_t prev_lo  = wwb_and( prev, wwb_bcast( 0x0F ) );
      65   100091111 :   wwb_t cur_hi   = wwb_shr( cur,  4 );
      66             : 
      67             :   /* High nibble of prev: which category of checks apply.
      68             :      nibble E -> bits 0,1 (overlong-3 / surrogate checks)
      69             :      nibble F -> bits 2,3 (overlong-4 / too-large checks) */
      70   100091111 :   wwb_t hi_lut = wwb_bcast_hex( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      71   100091111 :                                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x0C );
      72             : 
      73             :   /* Low nibble of prev: which specific byte within the category.
      74             :      0x0     -> bits 0,2   (E0 overlong-3, F0 overlong-4)
      75             :      0x4     -> bit  3     (F4 too-large)
      76             :      0x5-0xC -> bits 2,3   (F5-FC: entirely invalid, catch all conts)
      77             :      0xD     -> bits 1,2,3 (ED surrogate; FD entirely invalid)
      78             :      0xE-0xF -> bits 2,3   (FE-FF: entirely invalid)
      79             :      The E-nibble category uses bits 0,1 so the bits 2,3 entries for
      80             :      F5-FF do not interfere with E5-EF (0x03 & 0x0C == 0). */
      81   100091111 :   wwb_t lo_lut = wwb_bcast_hex( 0x05, 0x00, 0x00, 0x00, 0x08, 0x0C, 0x0C, 0x0C,
      82   100091111 :                                 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x0E, 0x0C, 0x0C );
      83             : 
      84             :   /* High nibble of cur: which continuation byte ranges are bad.
      85             :      nibble 8 -> bits 0,2   (80..8F: bad after E0 and F0)
      86             :      nibble 9 -> bits 0,3   (90..9F: bad after E0 and F4+)
      87             :      nibble A -> bits 1,3   (A0..AF: bad after ED and F4+)
      88             :      nibble B -> bits 1,3   (B0..BF: bad after ED and F4+) */
      89   100091111 :   wwb_t cur_lut = wwb_bcast_hex( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      90   100091111 :                                  0x05, 0x09, 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00 );
      91             : 
      92   100091111 :   wwb_t r1 = _mm512_shuffle_epi8( hi_lut,  prev_hi );
      93   100091111 :   wwb_t r2 = _mm512_shuffle_epi8( lo_lut,  prev_lo );
      94   100091111 :   wwb_t r3 = _mm512_shuffle_epi8( cur_lut, cur_hi  );
      95   100091111 :   return _mm512_ternarylogic_epi64( r1, r2, r3, 0x80 ); /* r1 & r2 & r3 */
      96   100091111 : }
      97             : 
      98             : /* A 2-byte lead (110xxxxx, C2..DF) must be followed by 1 continuation.
      99             :    A 3-byte lead (1110xxxx, E0..EF) must be followed by 2 continuations.
     100             :    A 4-byte lead (11110xxx, F0..F4) must be followed by 3 continuations. */
     101             : static inline wwb_t
     102             : check_continuations( wwb_t cur,
     103   100091111 :                      wwb_t prev ) {
     104   100091111 :   wwb_t p1 = prev1_byte( cur, prev );
     105   100091111 :   wwb_t p2 = prev2_byte( cur, prev );
     106   100091111 :   wwb_t p3 = prev3_byte( cur, prev );
     107             : 
     108             :   /* 2+ byte lead: byte >= C2 (C0,C1 are overlong, never valid)
     109             :      3+ byte lead: byte >= E0
     110             :      4  byte lead: byte >= F0 (only F0..F4 valid, but DFA handles that) */
     111   100091111 :   wwb_t is_2_3_4_lead = wwb_subs( p1, wwb_bcast( 0xC1 ) );
     112   100091111 :   wwb_t is_3_4_lead   = wwb_subs( p2, wwb_bcast( 0xDF ) );
     113   100091111 :   wwb_t is_4_lead     = wwb_subs( p3, wwb_bcast( 0xEF ) );
     114             : 
     115             :   /* OR all together */
     116   100091111 :   wwb_t must_be_cont = _mm512_ternarylogic_epi64( is_2_3_4_lead,
     117   100091111 :                                                     is_3_4_lead,
     118   100091111 :                                                       is_4_lead, 0xFE );
     119             : 
     120             :   /* is_cont = sub(byte ^ 0x80, 0x3F). 0 means IS continuation, otherwise NOT. */
     121   100091111 :   wwb_t xor     = wwb_xor( cur, wwb_bcast( 0x80 ) );
     122   100091111 :   wwb_t is_cont = wwb_subs( xor, wwb_bcast( 0x3F ) );
     123             : 
     124   100091111 :   ulong must_mask = wwb_ne( must_be_cont, wwb_zero() );
     125   100091111 :   ulong cont_mask = wwb_eq( is_cont,      wwb_zero() );
     126   100091111 :   return _mm512_movm_epi8( must_mask ^ cont_mask );
     127   100091111 : }
     128             : 
     129             : FD_FN_PURE int
     130             : fd_utf8_verify( char const * str,
     131    20012041 :                 ulong        sz ) {
     132             : 
     133    20012041 :   uchar const * cur = (uchar const *)str;
     134    20012041 :   if( FD_UNLIKELY( cur==NULL ) ) return !sz;
     135    20012037 :   uchar const * const end = cur + sz;
     136             : 
     137    20012037 :   wwb_t prev_chunk = wwb_zero();
     138    20012037 :   wwb_t error      = wwb_zero();
     139             : 
     140   840256317 :   while( cur+64<=end ) { /* While we have a zmm register of unicode left. */
     141   820244280 :     wwb_t chunk = wwb_ldu( cur );
     142             : 
     143             :     /* Fast path, we've loaded an entire chunk of ASCII */
     144   820244280 :     if( FD_LIKELY( !_mm512_test_epi8_mask( chunk, wwb_bcast( 0x80 ) ) ) ) {
     145             :       /* Make sure we aren't mid-sequence from the previous chunk.
     146             :          If prev_chunk ended with a lead byte (>= C2), that's an error
     147             :          because the expected continuations landed in this all-ASCII chunk.
     148             : 
     149             :          check_continuations() would have caught this, but we skip it on
     150             :          the fast path, so check if any byte in prev_chunk's last 3
     151             :          positions is a lead byte. */
     152   720153169 :       wwb_t p1 = prev1_byte( chunk, prev_chunk );
     153   720153169 :       wwb_t p2 = prev2_byte( chunk, prev_chunk );
     154   720153169 :       wwb_t p3 = prev3_byte( chunk, prev_chunk );
     155             : 
     156   720153169 :       error = wwb_or( error, wwb_subs( p1, wwb_bcast( 0xC1 ) ) );
     157   720153169 :       error = wwb_or( error, wwb_subs( p2, wwb_bcast( 0xDF ) ) );
     158   720153169 :       error = wwb_or( error, wwb_subs( p3, wwb_bcast( 0xEF ) ) );
     159             : 
     160   720153169 :       prev_chunk = chunk;
     161   720153169 :       cur += 64;
     162   720153169 :       continue;
     163   720153169 :     }
     164             : 
     165   100091111 :     error = wwb_or( error, check_special( chunk, prev1_byte( chunk, prev_chunk ) ) );
     166   100091111 :     error = wwb_or( error, check_continuations( chunk, prev_chunk ) );
     167             :     /* C0 and C1 are not valid in any context (overlong 2-byte leads).
     168             :        check_continuations skips them (threshold 0xC2), so detect them
     169             :        explicitly. (byte & 0xFE) == 0xC0 matches exactly C0 and C1. */
     170   100091111 :     error = wwb_or( error, _mm512_movm_epi8(
     171   100091111 :         wwb_eq( wwb_and( chunk, wwb_bcast( 0xFE ) ), wwb_bcast( 0xC0 ) ) ) );
     172   100091111 :     prev_chunk = chunk;
     173   100091111 :     cur += 64;
     174   100091111 :   }
     175             : 
     176    20012037 :   if( cur >= end ) {
     177             :     /* No tail bytes remain, so we check for an incomplete sequence at
     178             :        the end of the last chunk. We simply ignore all bytes other than
     179             :        the 4/3/2-byte leads. */
     180    18009088 :     wwb_t max_val = wwb(
     181    18009088 :       -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
     182    18009088 :       -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
     183    18009088 :       -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
     184    18009088 :       -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
     185    18009088 :       (char)0xEF, (char)0xDF, (char)0xBF );
     186    18009088 :     error = wwb_or( error, wwb_subs( prev_chunk, max_val ) );
     187    18009088 :   }
     188             : 
     189    20012037 :   if( FD_UNLIKELY( _mm512_test_epi8_mask( error, error ) ) ) return 0;
     190             : 
     191     8008976 :   if( cur < end ) {
     192             :     /* There are still tail bytes left, which could contain an
     193             :        incomplete multi-byte sequence from the end of the last SIMD
     194             :        chunk. We need to backup the current pointer so that the DFA
     195             :        will re-validate, starting from a known valid point. */
     196     2002949 :     uchar const * base = (uchar const *)str;
     197     2002949 :     if( cur > base ) {
     198          10 :       if(      cur[-1] >= 0xC0U )
     199           6 :         cur -= 1;
     200           4 :       else if( cur[-1] >= 0x80U && cur > base+1 && cur[-2] >= 0xE0U )
     201           1 :         cur -= 2;
     202           3 :       else if( cur[-1] >= 0x80U && cur > base+2 &&
     203           3 :                cur[-2] >= 0x80U && cur[-3] >= 0xF0U )
     204           1 :         cur -= 3;
     205          10 :     }
     206     2002949 :     uint state = 0;
     207    66069764 :     while( cur<end ) {
     208    64066815 :       uint type = fd_utf8_dfa[ *cur++ ];
     209    64066815 :       state = fd_utf8_dfa[ 256 + state*16 + type ];
     210    64066815 :     }
     211     2002949 :     if( state!=0 ) return 0;
     212     2002949 :   }
     213             : 
     214     8008387 :   return 1;
     215     8008976 : }
     216             : 
     217             : #else
     218             : 
     219             : FD_FN_PURE int
     220             : fd_utf8_verify( char const * str,
     221        2082 :                 ulong        sz ) {
     222             : 
     223        2082 :   uchar const * cur = (uchar const *)str;
     224        2082 :   if( FD_UNLIKELY( cur==NULL ) ) return !sz;
     225        2074 :   uchar const * const end = cur + sz;
     226             : 
     227        2074 :   uint state = 0;
     228       43522 :   while( cur<end ) {
     229       41448 :     uint type = fd_utf8_dfa[ *cur++ ];
     230       41448 :     state = fd_utf8_dfa[ 256 + state*16 + type ];
     231       41448 :   }
     232        2074 :   return state == 0;
     233        2082 : }
     234             : 
     235             : #endif /* FD_HAS_AVX512 */

Generated by: LCOV version 1.14