Line data Source code
1 : #include "fd_utf8.h"
2 :
3 : static uchar const fd_utf8_dfa[ 256 + 9*16 ] = {
4 : 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5 : 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
6 : 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
7 : 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
8 : 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
9 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
10 : 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
11 : 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3,
12 : 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,
13 : 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1,
14 : 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1,
15 : 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,
16 : 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1,
17 : 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
18 : };
19 :
20 : #if FD_HAS_AVX512
21 : #include "../../util/simd/fd_avx512.h"
22 :
23 : /* AVX512 UTF-8 validator based on https://arxiv.org/pdf/2010.03090
24 : "Validating UTF-8 In Less Than One Instruction Per Byte" */
25 :
26 : static inline wwb_t
27 : prev_carry( wwb_t cur,
28 2560823951 : wwb_t prev ) {
29 2560823951 : wwb_t idx = _mm512_set_epi64( 5, 4, 3, 2, 1, 0, 15, 14 );
30 2560823951 : return _mm512_permutex2var_epi64( cur, idx, prev );
31 2560823951 : }
32 :
33 : static inline wwb_t
34 : prev1_byte( wwb_t cur,
35 920335391 : wwb_t prev ) {
36 920335391 : return _mm512_alignr_epi8( cur, prev_carry( cur, prev ), 15 );
37 920335391 : }
38 :
39 : static inline wwb_t
40 : prev2_byte( wwb_t cur,
41 820244280 : wwb_t prev ) {
42 820244280 : return _mm512_alignr_epi8( cur, prev_carry( cur, prev ), 14 );
43 820244280 : }
44 :
45 : static inline wwb_t
46 : prev3_byte( wwb_t cur,
47 820244280 : wwb_t prev ) {
48 820244280 : return _mm512_alignr_epi8( cur, prev_carry( cur, prev ), 13 );
49 820244280 : }
50 :
51 : /* Check for special-case invalid byte sequences. These arise from
52 : overlong encodings, surrogates, and codepoints > U+10FFFF.
53 :
54 : After certain lead bytes, the valid range of the next byte is
55 : restricted:
56 : E0 xx: second byte must be in [A0,BF] (not [80,9F] -> overlong)
57 : ED xx: second byte must be in [80,9F] (not [A0,BF] -> surrogates)
58 : F0 xx: second byte must be in [90,BF] (not [80,8F] -> overlong)
59 : F4 xx: second byte must be in [80,8F] (not [90,BF] -> out of range) */
60 : static inline wwb_t
61 : check_special( wwb_t cur,
62 100091111 : wwb_t prev ) {
63 100091111 : wwb_t prev_hi = wwb_shr( prev, 4 );
64 100091111 : wwb_t prev_lo = wwb_and( prev, wwb_bcast( 0x0F ) );
65 100091111 : wwb_t cur_hi = wwb_shr( cur, 4 );
66 :
67 : /* High nibble of prev: which category of checks apply.
68 : nibble E -> bits 0,1 (overlong-3 / surrogate checks)
69 : nibble F -> bits 2,3 (overlong-4 / too-large checks) */
70 100091111 : wwb_t hi_lut = wwb_bcast_hex( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
71 100091111 : 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x0C );
72 :
73 : /* Low nibble of prev: which specific byte within the category.
74 : 0x0 -> bits 0,2 (E0 overlong-3, F0 overlong-4)
75 : 0x4 -> bit 3 (F4 too-large)
76 : 0x5-0xC -> bits 2,3 (F5-FC: entirely invalid, catch all conts)
77 : 0xD -> bits 1,2,3 (ED surrogate; FD entirely invalid)
78 : 0xE-0xF -> bits 2,3 (FE-FF: entirely invalid)
79 : The E-nibble category uses bits 0,1 so the bits 2,3 entries for
80 : F5-FF do not interfere with E5-EF (0x03 & 0x0C == 0). */
81 100091111 : wwb_t lo_lut = wwb_bcast_hex( 0x05, 0x00, 0x00, 0x00, 0x08, 0x0C, 0x0C, 0x0C,
82 100091111 : 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x0E, 0x0C, 0x0C );
83 :
84 : /* High nibble of cur: which continuation byte ranges are bad.
85 : nibble 8 -> bits 0,2 (80..8F: bad after E0 and F0)
86 : nibble 9 -> bits 0,3 (90..9F: bad after E0 and F4+)
87 : nibble A -> bits 1,3 (A0..AF: bad after ED and F4+)
88 : nibble B -> bits 1,3 (B0..BF: bad after ED and F4+) */
89 100091111 : wwb_t cur_lut = wwb_bcast_hex( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
90 100091111 : 0x05, 0x09, 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00 );
91 :
92 100091111 : wwb_t r1 = _mm512_shuffle_epi8( hi_lut, prev_hi );
93 100091111 : wwb_t r2 = _mm512_shuffle_epi8( lo_lut, prev_lo );
94 100091111 : wwb_t r3 = _mm512_shuffle_epi8( cur_lut, cur_hi );
95 100091111 : return _mm512_ternarylogic_epi64( r1, r2, r3, 0x80 ); /* r1 & r2 & r3 */
96 100091111 : }
97 :
98 : /* A 2-byte lead (110xxxxx, C2..DF) must be followed by 1 continuation.
99 : A 3-byte lead (1110xxxx, E0..EF) must be followed by 2 continuations.
100 : A 4-byte lead (11110xxx, F0..F4) must be followed by 3 continuations. */
101 : static inline wwb_t
102 : check_continuations( wwb_t cur,
103 100091111 : wwb_t prev ) {
104 100091111 : wwb_t p1 = prev1_byte( cur, prev );
105 100091111 : wwb_t p2 = prev2_byte( cur, prev );
106 100091111 : wwb_t p3 = prev3_byte( cur, prev );
107 :
108 : /* 2+ byte lead: byte >= C2 (C0,C1 are overlong, never valid)
109 : 3+ byte lead: byte >= E0
110 : 4 byte lead: byte >= F0 (only F0..F4 valid, but DFA handles that) */
111 100091111 : wwb_t is_2_3_4_lead = wwb_subs( p1, wwb_bcast( 0xC1 ) );
112 100091111 : wwb_t is_3_4_lead = wwb_subs( p2, wwb_bcast( 0xDF ) );
113 100091111 : wwb_t is_4_lead = wwb_subs( p3, wwb_bcast( 0xEF ) );
114 :
115 : /* OR all together */
116 100091111 : wwb_t must_be_cont = _mm512_ternarylogic_epi64( is_2_3_4_lead,
117 100091111 : is_3_4_lead,
118 100091111 : is_4_lead, 0xFE );
119 :
120 : /* is_cont = sub(byte ^ 0x80, 0x3F). 0 means IS continuation, otherwise NOT. */
121 100091111 : wwb_t xor = wwb_xor( cur, wwb_bcast( 0x80 ) );
122 100091111 : wwb_t is_cont = wwb_subs( xor, wwb_bcast( 0x3F ) );
123 :
124 100091111 : ulong must_mask = wwb_ne( must_be_cont, wwb_zero() );
125 100091111 : ulong cont_mask = wwb_eq( is_cont, wwb_zero() );
126 100091111 : return _mm512_movm_epi8( must_mask ^ cont_mask );
127 100091111 : }
128 :
129 : FD_FN_PURE int
130 : fd_utf8_verify( char const * str,
131 20012041 : ulong sz ) {
132 :
133 20012041 : uchar const * cur = (uchar const *)str;
134 20012041 : if( FD_UNLIKELY( cur==NULL ) ) return !sz;
135 20012037 : uchar const * const end = cur + sz;
136 :
137 20012037 : wwb_t prev_chunk = wwb_zero();
138 20012037 : wwb_t error = wwb_zero();
139 :
140 840256317 : while( cur+64<=end ) { /* While we have a zmm register of unicode left. */
141 820244280 : wwb_t chunk = wwb_ldu( cur );
142 :
143 : /* Fast path, we've loaded an entire chunk of ASCII */
144 820244280 : if( FD_LIKELY( !_mm512_test_epi8_mask( chunk, wwb_bcast( 0x80 ) ) ) ) {
145 : /* Make sure we aren't mid-sequence from the previous chunk.
146 : If prev_chunk ended with a lead byte (>= C2), that's an error
147 : because the expected continuations landed in this all-ASCII chunk.
148 :
149 : check_continuations() would have caught this, but we skip it on
150 : the fast path, so check if any byte in prev_chunk's last 3
151 : positions is a lead byte. */
152 720153169 : wwb_t p1 = prev1_byte( chunk, prev_chunk );
153 720153169 : wwb_t p2 = prev2_byte( chunk, prev_chunk );
154 720153169 : wwb_t p3 = prev3_byte( chunk, prev_chunk );
155 :
156 720153169 : error = wwb_or( error, wwb_subs( p1, wwb_bcast( 0xC1 ) ) );
157 720153169 : error = wwb_or( error, wwb_subs( p2, wwb_bcast( 0xDF ) ) );
158 720153169 : error = wwb_or( error, wwb_subs( p3, wwb_bcast( 0xEF ) ) );
159 :
160 720153169 : prev_chunk = chunk;
161 720153169 : cur += 64;
162 720153169 : continue;
163 720153169 : }
164 :
165 100091111 : error = wwb_or( error, check_special( chunk, prev1_byte( chunk, prev_chunk ) ) );
166 100091111 : error = wwb_or( error, check_continuations( chunk, prev_chunk ) );
167 : /* C0 and C1 are not valid in any context (overlong 2-byte leads).
168 : check_continuations skips them (threshold 0xC2), so detect them
169 : explicitly. (byte & 0xFE) == 0xC0 matches exactly C0 and C1. */
170 100091111 : error = wwb_or( error, _mm512_movm_epi8(
171 100091111 : wwb_eq( wwb_and( chunk, wwb_bcast( 0xFE ) ), wwb_bcast( 0xC0 ) ) ) );
172 100091111 : prev_chunk = chunk;
173 100091111 : cur += 64;
174 100091111 : }
175 :
176 20012037 : if( cur >= end ) {
177 : /* No tail bytes remain, so we check for an incomplete sequence at
178 : the end of the last chunk. We simply ignore all bytes other than
179 : the 4/3/2-byte leads. */
180 18009088 : wwb_t max_val = wwb(
181 18009088 : -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
182 18009088 : -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
183 18009088 : -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
184 18009088 : -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
185 18009088 : (char)0xEF, (char)0xDF, (char)0xBF );
186 18009088 : error = wwb_or( error, wwb_subs( prev_chunk, max_val ) );
187 18009088 : }
188 :
189 20012037 : if( FD_UNLIKELY( _mm512_test_epi8_mask( error, error ) ) ) return 0;
190 :
191 8008976 : if( cur < end ) {
192 : /* There are still tail bytes left, which could contain an
193 : incomplete multi-byte sequence from the end of the last SIMD
194 : chunk. We need to backup the current pointer so that the DFA
195 : will re-validate, starting from a known valid point. */
196 2002949 : uchar const * base = (uchar const *)str;
197 2002949 : if( cur > base ) {
198 10 : if( cur[-1] >= 0xC0U )
199 6 : cur -= 1;
200 4 : else if( cur[-1] >= 0x80U && cur > base+1 && cur[-2] >= 0xE0U )
201 1 : cur -= 2;
202 3 : else if( cur[-1] >= 0x80U && cur > base+2 &&
203 3 : cur[-2] >= 0x80U && cur[-3] >= 0xF0U )
204 1 : cur -= 3;
205 10 : }
206 2002949 : uint state = 0;
207 66069764 : while( cur<end ) {
208 64066815 : uint type = fd_utf8_dfa[ *cur++ ];
209 64066815 : state = fd_utf8_dfa[ 256 + state*16 + type ];
210 64066815 : }
211 2002949 : if( state!=0 ) return 0;
212 2002949 : }
213 :
214 8008387 : return 1;
215 8008976 : }
216 :
217 : #else
218 :
219 : FD_FN_PURE int
220 : fd_utf8_verify( char const * str,
221 2082 : ulong sz ) {
222 :
223 2082 : uchar const * cur = (uchar const *)str;
224 2082 : if( FD_UNLIKELY( cur==NULL ) ) return !sz;
225 2074 : uchar const * const end = cur + sz;
226 :
227 2074 : uint state = 0;
228 43522 : while( cur<end ) {
229 41448 : uint type = fd_utf8_dfa[ *cur++ ];
230 41448 : state = fd_utf8_dfa[ 256 + state*16 + type ];
231 41448 : }
232 2074 : return state == 0;
233 2082 : }
234 :
235 : #endif /* FD_HAS_AVX512 */
|