encoding_rs/ascii.rs
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// It's assumed that in due course Rust will have explicit SIMD but will not
// be good at run-time selection of SIMD vs. no-SIMD. In such a future,
// x86_64 will always use SSE2 and 32-bit x86 will use SSE2 when compiled with
// a Mozilla-shipped rustc. SIMD support and especially detection on ARM is a
// mess. Under the circumstances, it seems to make sense to optimize the ALU
// case for ARMv7 rather than x86. Annoyingly, I was unable to get useful
// numbers out of the actual ARMv7 CPU I have access to, because (thermal?)
// throttling kept interfering. Since Raspberry Pi 3 (ARMv8 core but running
// ARMv7 code) produced reproducible performance numbers, that's the ARM
// computer that this code ended up being optimized for in the ALU case.
// Less popular CPU architectures simply get the approach that was chosen based
// on Raspberry Pi 3 measurements. The UTF-16 and UTF-8 ALU cases take
// different approaches based on benchmarking on Raspberry Pi 3.

#[cfg(all(
    feature = "simd-accel",
    any(
        target_feature = "sse2",
        all(target_endian = "little", target_arch = "aarch64"),
        all(target_endian = "little", target_feature = "neon")
    )
))]
use crate::simd_funcs::*;

cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        #[allow(unused_imports)]
        use ::core::intrinsics::unlikely;
        #[allow(unused_imports)]
        use ::core::intrinsics::likely;
    } else {
        #[allow(dead_code)]
        #[inline(always)]
        fn unlikely(b: bool) -> bool {
            b
        }
        #[allow(dead_code)]
        #[inline(always)]
        fn likely(b: bool) -> bool {
            b
        }
    }
}
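
// A tiny test-only illustration (not part of the upstream file): the fallback
// shims above are identity functions on the boolean, so call sites written as
// `if unlikely(cond) { ... }` compile unchanged whether or not `simd-accel`
// (and thus the real branch-prediction intrinsics) is enabled.
#[cfg(all(test, not(feature = "simd-accel")))]
mod branch_hint_sketch {
    #[test]
    fn shims_are_identity() {
        assert!(super::likely(true));
        assert!(super::unlikely(true));
        assert!(!super::unlikely(false));
    }
}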

// Safety invariant for the masks below: `data & mask == 0` holds for valid
// ASCII (packed u8) and Basic Latin (packed UTF-16) data, respectively.

// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
pub const ASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize;

// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
pub const BASIC_LATIN_MASK: usize = 0xFF80_FF80_FF80_FF80u64 as usize;
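
// A minimal test-only sketch (not part of the upstream file) of how the masks
// above are meant to be used: a word of packed u8s is all-ASCII iff ANDing it
// with ASCII_MASK yields zero, and a word of packed u16s is all-Basic-Latin
// iff ANDing it with BASIC_LATIN_MASK yields zero.
#[cfg(test)]
mod mask_usage_sketch {
    use super::{ASCII_MASK, BASIC_LATIN_MASK};

    #[test]
    fn ascii_mask_catches_the_high_bit() {
        // Eight ASCII bytes packed into a word: the mask leaves nothing behind.
        assert_eq!(u64::from_le_bytes(*b"abcdefgh") as usize & ASCII_MASK, 0);
        // 0xC3 (e.g. a UTF-8 lead byte) has the high bit set, so it survives
        // the mask. The non-ASCII byte is placed first so that the assertion
        // also holds after `as usize` truncation on 32-bit targets.
        assert_ne!(u64::from_le_bytes(*b"\xC3abcdefg") as usize & ASCII_MASK, 0);
    }

    #[test]
    fn basic_latin_mask_catches_code_units_above_0x7f() {
        // U+00E9 (é) is Latin-1 but not Basic Latin: bit 7 survives the mask.
        // Packed little-endian u16s: 0x00E9, 0x0061 ('a'), 0x0062 ('b').
        let word = 0x0062_0061_00E9u64;
        assert_ne!(word as usize & BASIC_LATIN_MASK, 0);
        // "ab" as two u16 code units is Basic Latin.
        assert_eq!(0x0062_0061u64 as usize & BASIC_LATIN_MASK, 0);
    }
}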

#[allow(unused_macros)]
macro_rules! ascii_naive {
    ($name:ident, $src_unit:ty, $dst_unit:ty) => {
        /// Safety: src and dst must have `len` elements and be aligned
        /// Safety-usable invariant: will return Some() when it fails
        /// to convert. The first value will be a u8 that is > 127.
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            // Yes, manually omitting the bound check here matters
            // a lot for perf.
            for i in 0..len {
                // Safety: len invariant used here
                let code_unit = *(src.add(i));
                // Safety: Upholds safety-usable invariant here
                if code_unit > 127 {
                    return Some((code_unit, i));
                }
                // Safety: len invariant used here
                *(dst.add(i)) = code_unit as $dst_unit;
            }
            return None;
        }
    };
}
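
// Test-only usage sketch (not part of the upstream file): instantiating
// `ascii_naive!` and wrapping the generated unsafe fn in a safe, slice-based
// helper. The names `ascii_to_ascii_naive_sketch` and `copy_ascii` are
// hypothetical.
#[cfg(test)]
mod ascii_naive_sketch {
    ascii_naive!(ascii_to_ascii_naive_sketch, u8, u8);

    // The slice lengths establish the `len` precondition of the unsafe fn.
    fn copy_ascii(src: &[u8], dst: &mut [u8]) -> Option<(u8, usize)> {
        assert!(dst.len() >= src.len());
        // Safety: both pointers are valid for `src.len()` elements.
        unsafe { ascii_to_ascii_naive_sketch(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
    }

    #[test]
    fn stops_at_the_first_non_ascii_byte() {
        let src = b"hello\xC3!";
        let mut dst = [0u8; 7];
        // The non-ASCII byte and its index are reported per the
        // safety-usable invariant; the bytes before it have been copied.
        assert_eq!(copy_ascii(src, &mut dst), Some((0xC3, 5)));
        assert_eq!(&dst[..5], &b"hello"[..]);
    }
}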

#[allow(unused_macros)]
macro_rules! ascii_alu {
    ($name:ident,
     // safety invariant: src/dst MUST be u8
     $src_unit:ty,
     $dst_unit:ty,
     // Safety invariant: stride_fn must consume and produce two usizes, and return the index of the first non-ascii when it fails
     $stride_fn:ident) => {
        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
        /// write
        /// Safety-usable invariant: will return Some() when it fails
        /// to convert. The first value will be a u8 that is > 127.
        #[cfg_attr(feature = "cargo-clippy", allow(never_loop, cast_ptr_alignment))]
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                // Safety: until_alignment becomes the number of bytes we need to munch until we are aligned to usize
                let mut until_alignment = {
                    // Check if the other unit aligns if we move the narrower unit
                    // to alignment.
                    // if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() {
                    // ascii_to_ascii
                    let src_alignment = (src as usize) & ALU_ALIGNMENT_MASK;
                    let dst_alignment = (dst as usize) & ALU_ALIGNMENT_MASK;
                    if src_alignment != dst_alignment {
                        // Safety: bails early and ends up in the naïve branch where usize-alignment doesn't matter
                        break;
                    }
                    (ALU_ALIGNMENT - src_alignment) & ALU_ALIGNMENT_MASK
                    // } else if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                    // ascii_to_basic_latin
                    // let src_until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
                    // if (dst.add(src_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
                    //     break;
                    // }
                    // src_until_alignment
                    // } else {
                    // basic_latin_to_ascii
                    // let dst_until_alignment = (ALIGNMENT - ((dst as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
                    // if (src.add(dst_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
                    //     break;
                    // }
                    // dst_until_alignment
                    // }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
                    // mixed results when encoding from UTF-16 and since x86 and
                    // x86_64 should be using SSE2 in due course, keeping the move
                    // to alignment here. It would be good to test on more ARM CPUs
                    // and on real MIPS and POWER hardware.
                    //
                    // Safety: This is the naïve code once again, for `until_alignment` bytes
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        if code_unit > 127 {
                            // Safety: Upholds safety-usable invariant here
                            return Some((code_unit, offset));
                        }
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        // Safety: offset is the number of bytes copied so far
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        // Safety: num_ascii is known to be a byte index of a non-ascii byte due to stride_fn's invariant
                        if let Some(num_ascii) = $stride_fn(
                            // Safety: These are known to be valid and aligned since we have at
                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
                            // number of elements copied so far, which according to the
                            // until_alignment calculation above will cause both src and dst to be
                            // aligned to usize after this add
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        ) {
                            offset += num_ascii;
                            // Safety: Upholds safety-usable invariant here by indexing into non-ascii byte
                            return Some((*(src.add(offset)), offset));
                        }
                        // Safety: offset continues to be the number of bytes copied so far, and
                        // maintains usize alignment for the next loop iteration
                        offset += ALU_STRIDE_SIZE;
                        // Safety: This is `offset > len - stride`. This loop will continue as long as
                        // `offset <= len - stride`, which means there are `stride` bytes still to be read.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }

            // Safety: This is the naïve code, same as ascii_naive, and has no requirements
            // other than src/dst being valid for the right lengths
            while offset < len {
                // Safety: len invariant used here
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety: Upholds safety-usable invariant here
                    return Some((code_unit, offset));
                }
                // Safety: len invariant used here
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
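
// Minimal test-only sketch (not part of the upstream file) of the contract
// `ascii_alu!` expects from `$stride_fn`: read two usize words through the
// aligned pointer, report the byte index of the first non-ASCII byte if the
// combined word trips ASCII_MASK, and otherwise return None. The real stride
// functions live elsewhere in this file; `ascii_stride_sketch` is hypothetical
// and assumes a little-endian target.
#[cfg(all(test, target_endian = "little"))]
mod alu_stride_sketch {
    use super::ASCII_MASK;

    unsafe fn ascii_stride_sketch(src: *const usize, dst: *mut usize) -> Option<usize> {
        let word = *src;
        let second_word = *(src.add(1));
        // Writing before checking is fine for ASCII-to-ASCII: on failure the
        // caller only relies on dst being valid up to the reported index.
        *dst = word;
        *(dst.add(1)) = second_word;
        if (word | second_word) & ASCII_MASK == 0 {
            return None;
        }
        // On little-endian, trailing_zeros() / 8 of the masked word is the
        // byte index of the first byte whose high bit is set.
        let masked = word & ASCII_MASK;
        Some(if masked != 0 {
            (masked.trailing_zeros() / 8) as usize
        } else {
            ::core::mem::size_of::<usize>()
                + ((second_word & ASCII_MASK).trailing_zeros() / 8) as usize
        })
    }

    #[test]
    fn reports_the_first_non_ascii_byte() {
        let mut src = [0usize; 2];
        let mut dst = [0usize; 2];
        let byte_len = 2 * ::core::mem::size_of::<usize>();
        // View the source words as bytes to plant a non-ASCII byte at index 3.
        let bytes =
            unsafe { ::core::slice::from_raw_parts_mut(src.as_mut_ptr() as *mut u8, byte_len) };
        for b in bytes.iter_mut() {
            *b = b'x';
        }
        bytes[3] = 0xFF;
        assert_eq!(
            unsafe { ascii_stride_sketch(src.as_ptr(), dst.as_mut_ptr()) },
            Some(3)
        );
    }
}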

#[allow(unused_macros)]
macro_rules! basic_latin_alu {
    ($name:ident,
     // safety invariant: use u8 for src/dst for ascii, and u16 for basic_latin
     $src_unit:ty,
     $dst_unit:ty,
     // safety invariant: stride_fn must munch ALU_STRIDE_SIZE*size(src_unit) bytes off of src and
     // write ALU_STRIDE_SIZE*size(dst_unit) bytes to dst
     $stride_fn:ident) => {
        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
        /// write
        /// Safety-usable invariant: will return Some() when it fails
        /// to convert. The first value will be a u8 that is > 127.
        #[cfg_attr(
            feature = "cargo-clippy",
            allow(never_loop, cast_ptr_alignment, cast_lossless)
        )]
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                // Safety: until_alignment becomes the number of bytes we need to munch from src/dst until we are aligned to usize
                // We ensure basic-latin has the same alignment as ascii, starting with ascii since it is smaller.
                let mut until_alignment = {
                    // Check if the other unit aligns if we move the narrower unit
                    // to alignment.
                    // if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() {
                    // ascii_to_ascii
                    // let src_alignment = (src as usize) & ALIGNMENT_MASK;
                    // let dst_alignment = (dst as usize) & ALIGNMENT_MASK;
                    // if src_alignment != dst_alignment {
                    //     break;
                    // }
                    // (ALIGNMENT - src_alignment) & ALIGNMENT_MASK
                    // } else
                    if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                        // ascii_to_basic_latin
                        let src_until_alignment = (ALU_ALIGNMENT
                            - ((src as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        src_until_alignment
                    } else {
                        // basic_latin_to_ascii
                        let dst_until_alignment = (ALU_ALIGNMENT
                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        dst_until_alignment
                    }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
                    // mixed results when encoding from UTF-16 and since x86 and
                    // x86_64 should be using SSE2 in due course, keeping the move
                    // to alignment here. It would be good to test on more ARM CPUs
                    // and on real MIPS and POWER hardware.
                    //
                    // Safety: This is the naïve code once again, for `until_alignment` bytes
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        if code_unit > 127 {
                            // Safety: Upholds safety-usable invariant here
                            return Some((code_unit, offset));
                        }
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        // Safety: offset is the number of bytes copied so far
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        if !$stride_fn(
                            // Safety: These are known to be valid and aligned since we have at
                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
                            // number of elements copied so far, which according to the
                            // until_alignment calculation above will cause both src and dst to be
                            // aligned to usize after this add
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        ) {
                            break;
                        }
                        // Safety: offset continues to be the number of bytes copied so far, and
                        // maintains usize alignment for the next loop iteration
                        offset += ALU_STRIDE_SIZE;
                        // Safety: This is `offset > len - stride`. This loop will continue as long as
                        // `offset <= len - stride`, which means there are `stride` bytes still to be read.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            // Safety: This is the naïve code once again, for leftover bytes
            while offset < len {
                // Safety: len invariant used here
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety: Upholds safety-usable invariant here
                    return Some((code_unit, offset));
                }
                // Safety: len invariant used here
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}

#[allow(unused_macros)]
macro_rules! latin1_alu {
    // safety invariant: stride_fn must munch ALU_STRIDE_SIZE*size(src_unit) bytes off of src and
    // write ALU_STRIDE_SIZE*size(dst_unit) bytes to dst
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_fn:ident) => {
        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
        /// write
        #[cfg_attr(
            feature = "cargo-clippy",
            allow(never_loop, cast_ptr_alignment, cast_lossless)
        )]
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                // Safety: until_alignment becomes the number of bytes we need to munch from src/dst until we are aligned to usize
                // We ensure the UTF-16 side has the same alignment as the Latin-1 side, starting with Latin-1 since it is smaller.
                let mut until_alignment = {
                    if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                        // unpack
                        let src_until_alignment = (ALU_ALIGNMENT
                            - ((src as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        src_until_alignment
                    } else {
                        // pack
                        let dst_until_alignment = (ALU_ALIGNMENT
                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        dst_until_alignment
                    }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Safety: This is the naïve code once again, for `until_alignment` bytes
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        // Safety: offset is the number of bytes copied so far
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        $stride_fn(
                            // Safety: These are known to be valid and aligned since we have at
                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
                            // number of elements copied so far, which according to the
                            // until_alignment calculation above will cause both src and dst to be
                            // aligned to usize after this add
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        );
                        // Safety: offset continues to be the number of bytes copied so far, and
                        // maintains usize alignment for the next loop iteration
                        offset += ALU_STRIDE_SIZE;
                        // Safety: This is `offset > len - stride`. This loop will continue as long as
                        // `offset <= len - stride`, which means there are `stride` bytes still to be read.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            // Safety: This is the naïve code once again, for leftover bytes
            while offset < len {
                // Safety: len invariant used here
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_simd_check_align {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned/aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_dst_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident
    ) => {
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        ///
        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // XXX Should we first process one stride unconditionally as unaligned to
                // avoid the cost of the branchiness below if the first stride fails anyway?
                // XXX Should we just use unaligned SSE2 access unconditionally? It seems that
                // on Haswell, it would make sense to just use unaligned and not bother
                // checking. Need to benchmark older architectures before deciding.
                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
                // Safety: checking whether src is aligned
                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
                    // Safety: Checking whether dst is aligned
                    if dst_masked == 0 {
                        loop {
                            // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and have the appropriate alignments
                            if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and have the appropriate alignments
                            if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                } else {
                    if dst_masked == 0 {
                        loop {
                            // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and have the appropriate alignments
                            if !$stride_dst_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and have the appropriate alignments
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                }
            }
            while offset < len {
                // Safety: uses len invariant here and below
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety: upholds safety-usable invariant
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
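
// Test-only illustration (not part of the upstream file) of the alignment
// test used by the dispatch above: with a 16-byte SIMD granularity, a pointer
// is aligned iff its low four bits are zero. The local MASK mirrors
// SIMD_ALIGNMENT_MASK on the SSE2/NEON paths.
#[cfg(test)]
mod simd_align_sketch {
    #[test]
    fn low_bits_decide_alignment() {
        const MASK: usize = 15;
        let buf = [0u8; 32];
        let base = buf.as_ptr() as usize;
        // Round up to the next 16-byte boundary, as the `until_alignment`
        // computations in this file do.
        let aligned = base + ((16 - (base & MASK)) & MASK);
        assert_eq!(aligned & MASK, 0);
        // One byte further on is never 16-byte aligned.
        assert_ne!((aligned + 1) & MASK, 0);
    }
}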

#[allow(unused_macros)]
macro_rules! ascii_simd_check_align_unrolled {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident,
        // Safety: This function must require aligned src/dest that are valid for reading/writing 2*SIMD_STRIDE_SIZE src_unit/dst_unit
        $double_stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing 2*SIMD_STRIDE_SIZE src_unit/dst_unit
        $double_stride_src_aligned:ident
    ) => {
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        ///
        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let unit_size = ::core::mem::size_of::<$src_unit>();
            let mut offset = 0usize;
            // This loop is only broken out of as a goto forward without
            // actually looping
            'outer: loop {
                // Safety: if this check succeeds we're valid for reading/writing at least `SIMD_STRIDE_SIZE` elements.
                if SIMD_STRIDE_SIZE <= len {
                    // First, process one stride unconditionally as unaligned
                    // Safety: this is safe to call since we're valid for this read/write
                    if !$stride_neither_aligned(src, dst) {
                        break 'outer;
                    }
                    offset = SIMD_STRIDE_SIZE;

                    // We have now seen 16 ASCII bytes. Let's guess that
                    // there will be enough more to justify more expense
                    // in the case of non-ASCII.
                    // Use aligned reads for the sake of old microarchitectures.
                    //
                    // Safety: this correctly calculates the number of src_units that need to be read before the remaining data is aligned.
                    // This is less than SIMD_ALIGNMENT, which is also SIMD_STRIDE_SIZE (as documented)
                    let until_alignment = ((SIMD_ALIGNMENT
                        - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK))
                        & SIMD_ALIGNMENT_MASK)
                        / unit_size;
                    // Safety: This addition won't overflow, because even in the 32-bit PAE case the
                    // address space holds enough code that the slice length can't be that
                    // close to address space size.
                    // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
                    //
                    // Safety: if this check succeeds we're valid for reading/writing at least `2 * SIMD_STRIDE_SIZE` elements plus `until_alignment`.
                    // The extra SIMD_STRIDE_SIZE in the condition is because `offset` is already `SIMD_STRIDE_SIZE`.
                    if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
                        if until_alignment != 0 {
                            // Safety: this is safe to call since we're valid for this read/write (and more), and don't care about alignment
                            // This will copy over bytes that get decoded twice, since it's not incrementing `offset` by SIMD_STRIDE_SIZE. This is fine.
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += until_alignment;
                        }
                        // Safety: At this point we're valid for reading/writing 2*SIMD_STRIDE_SIZE elements
                        // Safety: Now `offset` is aligned for `src`
                        let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
                        // Safety: This is whether dst is aligned
                        let dst_masked = (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK;
                        if dst_masked == 0 {
                            loop {
                                // Safety: both are aligned, so we can call the aligned function. We're valid for reading/writing double stride from the initial condition
                                // and the loop break condition below
                                if let Some(advance) =
                                    $double_stride_both_aligned(src.add(offset), dst.add(offset))
                                {
                                    offset += advance;
                                    let code_unit = *(src.add(offset));
                                    // Safety: uses the safety-usable invariant on the double-stride functions to return
                                    // guaranteed non-ascii
                                    return Some((code_unit, offset));
                                }
                                offset += SIMD_STRIDE_SIZE * 2;
                                // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which means we always have at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                                if offset > len_minus_stride_times_two {
                                    break;
                                }
                            }
                            // Safety: We're valid for reading/writing one more stride, and can still assume alignment
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        } else {
                            loop {
                                // Safety: only src is aligned here. We're valid for reading/writing double stride from the initial condition
                                // and the loop break condition below
                                if let Some(advance) =
                                    $double_stride_src_aligned(src.add(offset), dst.add(offset))
                                {
                                    offset += advance;
                                    let code_unit = *(src.add(offset));
                                    // Safety: uses the safety-usable invariant on the double-stride functions to return
                                    // guaranteed non-ascii
                                    return Some((code_unit, offset));
                                }
                                offset += SIMD_STRIDE_SIZE * 2;
                                // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which means we always have at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                                if offset > len_minus_stride_times_two {
                                    break;
                                }
                            }
                            // Safety: We're valid for reading/writing one more stride, and can still assume alignment
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        }
                    } else {
                        // At most two iterations, so unroll
                        if offset + SIMD_STRIDE_SIZE <= len {
                            // Safety: The check above ensures we're allowed to read/write this, and we don't use alignment
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset + SIMD_STRIDE_SIZE <= len {
                                // Safety: The check above ensures we're allowed to read/write this, and we don't use alignment
                                if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                    break;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        }
                    }
                }
                break 'outer;
            }
            while offset < len {
                // Safety: relies straightforwardly on the `len` invariant
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety-usable invariant upheld here
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}

#[allow(unused_macros)]
macro_rules! latin1_simd_check_align {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned/aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_dst_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident
    ) => {
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // Whether dst is aligned
                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
                // Whether src is aligned
                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
                    if dst_masked == 0 {
                        loop {
                            // Safety: Both were aligned, we can use the aligned function
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in the next iteration we're valid for
                            // reading/writing at least SIMD_STRIDE_SIZE elements.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: src was aligned, dst was not
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in the next iteration we're valid for
                            // reading/writing at least SIMD_STRIDE_SIZE elements.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                } else {
                    if dst_masked == 0 {
                        loop {
                            // Safety: dst was aligned, src was not
                            $stride_dst_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in the next iteration we're valid for
                            // reading/writing at least SIMD_STRIDE_SIZE elements.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: Neither was aligned
                            $stride_neither_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in the next iteration we're valid for
                            // reading/writing at least SIMD_STRIDE_SIZE elements.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                }
            }
            while offset < len {
                // Safety: relies straightforwardly on the `len` invariant
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! latin1_simd_check_align_unrolled {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned/aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_dst_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident
    ) => {
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let unit_size = ::core::mem::size_of::<$src_unit>();
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                // Safety: this correctly calculates the number of src_units that need to be read before the remaining data is aligned.
                // This is by definition less than SIMD_STRIDE_SIZE.
                let mut until_alignment = ((SIMD_STRIDE_SIZE
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                while until_alignment != 0 {
                    // Safety: This is a straightforward copy; since until_alignment is < SIMD_STRIDE_SIZE < len, this is in-bounds
                    *(dst.add(offset)) = *(src.add(offset)) as $dst_unit;
                    offset += 1;
                    until_alignment -= 1;
                }
                // Safety: here offset will be `until_alignment`, i.e. enough to align `src`.
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // Safety: if this check succeeds we're valid for reading/writing at least `2 * SIMD_STRIDE_SIZE` elements.
                if offset + SIMD_STRIDE_SIZE * 2 <= len {
                    let len_minus_stride_times_two = len_minus_stride - SIMD_STRIDE_SIZE;
                    // Safety: at this point src is known to be aligned at offset, dst is not known to be.
                    if (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK == 0 {
                        loop {
                            // Safety: We checked the alignment of dst above, so we can use the aligned functions. We're allowed to read/write 2*SIMD_STRIDE_SIZE elements, which we do.
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which means we always have at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride_times_two {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: we ensured the alignment of src already.
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which means we always have at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride_times_two {
                                break;
                            }
                        }
                    }
                }
                // Safety: This is `offset < len - SIMD_STRIDE_SIZE`, which means we are valid to munch SIMD_STRIDE_SIZE more elements, which we do
                if offset < len_minus_stride {
                    $stride_src_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                }
            }
            while offset < len {
                // Safety: uses len invariant here and below
                let code_unit = *(src.add(offset));
                // On x86_64, this loop autovectorizes but in the pack
                // case there are instructions whose purpose is to make sure
                // each u16 in the vector is truncated before packing. However,
                // since we don't care about the saturating behavior of SSE2 packing
                // when the input isn't Latin1, those instructions are useless.
                // Unfortunately, using the `assume` intrinsic to lie to the
                // optimizer doesn't make LLVM omit the truncation that we
                // don't need. Possibly this loop could be manually optimized
                // to do the sort of thing that LLVM does but without ANDing
                // the read vectors of u16 with a constant that discards
                // the high half of each u16. As far as I can tell, the
                // optimization assumes that doing a SIMD read past the end of
                // the array is OK.
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_simd_unalign {
    // Safety: stride_neither_aligned must be a function that requires src/dest be valid for unaligned reads/writes for SIMD_STRIDE_SIZE elements of type src_unit/dest_unit
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        /// Safety: src and dst must be valid for reads/writes of len elements of type src_unit/dst_unit
        ///
        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `stride` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    // Safety: We know we're valid for `stride` reads/writes, so we can call this function. We don't need alignment.
                    if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                        break;
                    }
                    offset += SIMD_STRIDE_SIZE;
                    // This is `offset > len - stride` which means we always have at least `stride` elements to munch next time.
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                // Safety: Uses len invariant here and below
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety-usable invariant upheld here
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}

#[allow(unused_macros)]
macro_rules! latin1_simd_unalign {
    // Safety: stride_neither_aligned must be a function that requires src/dest be valid for unaligned reads/writes for SIMD_STRIDE_SIZE elements of type src_unit/dest_unit
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        /// Safety: src and dst must be valid for unaligned reads/writes of len elements of type src_unit/dst_unit
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `stride` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    // Safety: We know we're valid for `stride` reads/writes, so we can call this function. We don't need alignment.
                    $stride_neither_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                    // This is `offset > len - stride` which means we always have at least `stride` elements to munch next time.
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                // Safety: Uses len invariant here
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_to_ascii_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 16 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either u16x8 or u8x16.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            $store(dst, simd);
            true
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_to_ascii_simd_double_stride {
    // Safety: store must be valid for 32 bytes of write, which may be unaligned (candidates: `store(8|16)_(aligned|unaligned)`)
    ($name:ident, $store:ident) => {
        /// Safety: src must be valid for 32 bytes of aligned u8x16 read
        /// dst must be valid for 32 bytes of unaligned write according to
        /// the $store fn, which may allow for unaligned writes or require
        /// alignment to either u16x8 or u8x16.
        ///
        /// Safety-usable invariant: Returns Some(index) if the element at `index` is invalid ASCII
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            $store(dst, first);
            if unlikely(!simd_is_ascii(first | second)) {
                // Safety: mask_ascii produces a mask of all the high bits.
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    // Safety: on little-endian systems this will be the number of ascii bytes
                    // before the first non-ascii, i.e. valid for indexing src
                    // TODO SAFETY: What about big-endian systems?
                    return Some(mask_first.trailing_zeros() as usize);
                }
                $store(dst.add(SIMD_STRIDE_SIZE), second);
                let mask_second = mask_ascii(second);
                // Safety: on little-endian systems this will be the number of ascii bytes
                // before the first non-ascii, i.e. valid for indexing src
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            $store(dst.add(SIMD_STRIDE_SIZE), second);
            None
        }
    };
}
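
// Test-only sketch (not part of the upstream file) of why
// `mask.trailing_zeros()` above is the number of ASCII bytes before the first
// non-ASCII one: `mask_ascii` is a movemask-style operation that gathers one
// bit per lane, lane 0 in bit 0, so with little-endian lane order the lowest
// set bit marks the first failing lane. `scalar_mask_ascii` is a hypothetical
// scalar stand-in for the SIMD version.
#[cfg(test)]
mod movemask_sketch {
    fn scalar_mask_ascii(lanes: &[u8; 16]) -> u32 {
        lanes
            .iter()
            .enumerate()
            .fold(0u32, |acc, (i, &b)| acc | (((b >> 7) as u32) << i))
    }

    #[test]
    fn trailing_zeros_finds_the_first_non_ascii_lane() {
        let mut lanes = [b'a'; 16];
        lanes[5] = 0x80;
        lanes[9] = 0xFF;
        assert_eq!(scalar_mask_ascii(&lanes).trailing_zeros(), 5);
    }
}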

#[allow(unused_macros)]
macro_rules! ascii_to_basic_latin_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 16/32 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either u16x8 or u8x16.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            let (first, second) = simd_unpack(simd);
            $store(dst, first);
            $store(dst.add(8), second);
            true
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_to_basic_latin_simd_double_stride {
    // Safety: store must be valid for 16 bytes of write, which may be unaligned
    ($name:ident, $store:ident) => {
        /// Safety: src must be valid for 2*SIMD_STRIDE_SIZE bytes of aligned reads,
        /// aligned to either u16x8 or u8x16.
        /// dst must be valid for 2*SIMD_STRIDE_SIZE bytes of aligned or unaligned writes
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            let (a, b) = simd_unpack(first);
            $store(dst, a);
            // Safety: divide by 2 since dst is a u16 pointer
            $store(dst.add(SIMD_STRIDE_SIZE / 2), b);
            if unlikely(!simd_is_ascii(first | second)) {
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    return Some(mask_first.trailing_zeros() as usize);
                }
                let (c, d) = simd_unpack(second);
                $store(dst.add(SIMD_STRIDE_SIZE), c);
                $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
                let mask_second = mask_ascii(second);
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            let (c, d) = simd_unpack(second);
            $store(dst.add(SIMD_STRIDE_SIZE), c);
            $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
            None
        }
    };
}

#[allow(unused_macros)]
macro_rules! unpack_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 16 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either u16x8 or u8x16.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) {
            let simd = $load(src);
            let (first, second) = simd_unpack(simd);
            $store(dst, first);
            $store(dst.add(8), second);
        }
    };
}

#[allow(unused_macros)]
macro_rules! basic_latin_to_ascii_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 32/16 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either u16x8 or u8x16.
        #[inline(always)]
        pub unsafe fn $name(src: *const u16, dst: *mut u8) -> bool {
            let first = $load(src);
            let second = $load(src.add(8));
            if simd_is_basic_latin(first | second) {
                $store(dst, simd_pack(first, second));
                true
            } else {
                false
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! pack_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 32/16 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either u16x8 or u8x16.
        #[inline(always)]
        pub unsafe fn $name(src: *const u16, dst: *mut u8) {
            let first = $load(src);
            let second = $load(src.add(8));
            $store(dst, simd_pack(first, second));
        }
    };
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "aarch64"))] {
        // SIMD with the same instructions for aligned and unaligned loads and stores

        pub const SIMD_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

// pub const ALIGNMENT: usize = 8;

        pub const ALU_STRIDE_SIZE: usize = 16;

        pub const ALU_ALIGNMENT: usize = 8;

        pub const ALU_ALIGNMENT_MASK: usize = 7;

        // Safety for stride macros: We stick to the load8_aligned/etc family of functions. We consistently produce
        // the neither_aligned variants using only unaligned loads and stores.
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);

        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
        unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);

        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
        pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);

        // Safety for conversion macros: We use the unalign macro with the unaligned functions above. All stride functions were produced
        // by stride macros that universally munch a single SIMD_STRIDE_SIZE worth of elements.
        ascii_simd_unalign!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_neither_aligned);
        ascii_simd_unalign!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_neither_aligned);
        ascii_simd_unalign!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_neither_aligned);
        latin1_simd_unalign!(unpack_latin1, u8, u16, unpack_stride_neither_aligned);
        latin1_simd_unalign!(pack_latin1, u16, u8, pack_stride_neither_aligned);
    } else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
        // SIMD with different instructions for aligned and unaligned loads and stores.
        //
        // Newer microarchitectures are not supposed to have a performance difference between
        // aligned and unaligned loads and stores when the address is actually aligned,
        // but the benchmark results I see don't agree.

        pub const SIMD_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        pub const SIMD_ALIGNMENT_MASK: usize = 15;

        // Safety for stride macros: We stick to the load8_aligned/etc family of functions. We consistently name
        // aligned/unaligned functions according to src/dst being aligned/unaligned

        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_dst_aligned, load16_unaligned, store16_aligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);

        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_dst_aligned, load16_unaligned, store8_aligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);

        unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
        unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
        unpack_simd_stride!(unpack_stride_dst_aligned, load16_unaligned, store8_aligned);
        unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);

        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);

        pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
        pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
        pack_simd_stride!(pack_stride_dst_aligned, load8_unaligned, store16_aligned);
        pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);

        // Safety for conversion macros: We use the correct pattern of both/src/dst/neither here. All stride functions were produced
        // by stride macros that universally munch a single SIMD_STRIDE_SIZE worth of elements.

        ascii_simd_check_align!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_dst_aligned, ascii_to_ascii_stride_neither_aligned);
        ascii_simd_check_align!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_dst_aligned, ascii_to_basic_latin_stride_neither_aligned);
        ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
        latin1_simd_check_align!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
        latin1_simd_check_align!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
    } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
        // SIMD with different instructions for aligned and unaligned loads and stores.
        //
        // Newer microarchitectures are not supposed to have a performance difference between
        // aligned and unaligned SSE2 loads and stores when the address is actually aligned,
        // but the benchmark results I see don't agree.

        pub const SIMD_STRIDE_SIZE: usize = 16;

        /// Safety-usable invariant: This should be identical to SIMD_STRIDE_SIZE (used by ascii_simd_check_align_unrolled)
        pub const SIMD_ALIGNMENT: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        pub const SIMD_ALIGNMENT_MASK: usize = 15;

        // Safety for stride macros: We stick to the load8_aligned/etc family of functions. We consistently name
        // aligned/unaligned functions according to src/dst being aligned/unaligned

        ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_both_aligned, store16_aligned);
        ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_src_aligned, store16_unaligned);

        ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_both_aligned, store8_aligned);
        ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_src_aligned, store8_unaligned);

        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);

        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);

        unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
        unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);

        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);

        pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
        pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);

        // Safety for conversion macros: We use the correct pattern of both/src/dst/neither/double_both/double_src here. All stride functions were produced
        // by stride macros that universally munch a single SIMD_STRIDE_SIZE worth of elements.

        ascii_simd_check_align_unrolled!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_neither_aligned, ascii_to_ascii_simd_double_stride_both_aligned, ascii_to_ascii_simd_double_stride_src_aligned);
        ascii_simd_check_align_unrolled!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_neither_aligned, ascii_to_basic_latin_simd_double_stride_both_aligned, ascii_to_basic_latin_simd_double_stride_src_aligned);

        ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
        latin1_simd_check_align_unrolled!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
        latin1_simd_check_align_unrolled!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
1250 } else if #[cfg(all(target_endian = "little", target_pointer_width = "64"))] {
1251 // Aligned ALU word, little-endian, 64-bit
1252
1253 /// Safety invariant: this is the amount of bytes consumed by
1254 /// unpack_alu. This will be twice the pointer width, as it consumes two usizes.
1255 /// This is also the number of bytes produced by pack_alu.
1256 /// This is also the number of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
1257 pub const ALU_STRIDE_SIZE: usize = 16;
1258
1259 pub const MAX_STRIDE_SIZE: usize = 16;
1260
1261 // Safety invariant: this is the pointer width in bytes
1262 pub const ALU_ALIGNMENT: usize = 8;
1263
1264 // Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
1265 pub const ALU_ALIGNMENT_MASK: usize = 7;

        /// Safety: dst must point to valid space for writing four `usize`s
        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0x0000_0000_FF00_0000usize & word) << 24) |
                        ((0x0000_0000_00FF_0000usize & word) << 16) |
                        ((0x0000_0000_0000_FF00usize & word) << 8) |
                        (0x0000_0000_0000_00FFusize & word);
            let second = ((0xFF00_0000_0000_0000usize & word) >> 8) |
                         ((0x00FF_0000_0000_0000usize & word) >> 16) |
                         ((0x0000_FF00_0000_0000usize & word) >> 24) |
                         ((0x0000_00FF_0000_0000usize & word) >> 32);
            let third = ((0x0000_0000_FF00_0000usize & second_word) << 24) |
                        ((0x0000_0000_00FF_0000usize & second_word) << 16) |
                        ((0x0000_0000_0000_FF00usize & second_word) << 8) |
                        (0x0000_0000_0000_00FFusize & second_word);
            let fourth = ((0xFF00_0000_0000_0000usize & second_word) >> 8) |
                         ((0x00FF_0000_0000_0000usize & second_word) >> 16) |
                         ((0x0000_FF00_0000_0000usize & second_word) >> 24) |
                         ((0x0000_00FF_0000_0000usize & second_word) >> 32);
            // Safety: fn invariant used here
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }
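
        // Worked example (illustrative): on little endian, loading the ASCII
        // bytes "abcdefgh" as a usize yields
        //     word = 0x6867_6665_6463_6261
        // and the shifts above spread the low half out to
        //     first  = 0x0064_0063_0062_0061 (the u16s 'a', 'b', 'c', 'd')
        // and the high half out to
        //     second = 0x0068_0067_0066_0065 (the u16s 'e', 'f', 'g', 'h'),
        // with `second_word` expanded the same way into `third` and `fourth`.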

        /// Safety: dst must point to valid space for writing two `usize`s
        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000_0000_0000usize & second) << 8) |
                       ((0x0000_00FF_0000_0000usize & second) << 16) |
                       ((0x0000_0000_00FF_0000usize & second) << 24) |
                       ((0x0000_0000_0000_00FFusize & second) << 32) |
                       ((0x00FF_0000_0000_0000usize & first) >> 24) |
                       ((0x0000_00FF_0000_0000usize & first) >> 16) |
                       ((0x0000_0000_00FF_0000usize & first) >> 8) |
                       (0x0000_0000_0000_00FFusize & first);
            let second_word = ((0x00FF_0000_0000_0000usize & fourth) << 8) |
                              ((0x0000_00FF_0000_0000usize & fourth) << 16) |
                              ((0x0000_0000_00FF_0000usize & fourth) << 24) |
                              ((0x0000_0000_0000_00FFusize & fourth) << 32) |
                              ((0x00FF_0000_0000_0000usize & third) >> 24) |
                              ((0x0000_00FF_0000_0000usize & third) >> 16) |
                              ((0x0000_0000_00FF_0000usize & third) >> 8) |
                              (0x0000_0000_0000_00FFusize & third);
            // Safety: fn invariant used here
            *dst = word;
            *(dst.add(1)) = second_word;
        }
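
        // Worked example (illustrative): packing is the exact inverse of the
        // expansion above, so
        //     first  = 0x0064_0063_0062_0061
        //     second = 0x0068_0067_0066_0065
        // recombine into word = 0x6867_6665_6463_6261, i.e. the bytes
        // "abcdefgh" in memory order. The high byte of each u16 must already
        // be zero; callers check against BASIC_LATIN_MASK before packing.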
    } else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] {
        // Aligned ALU word, little-endian, 32-bit

        /// Safety invariant: this is the number of bytes consumed by
        /// unpack_alu, which is twice the pointer width because it consumes two usizes.
        /// This is also the number of bytes produced by pack_alu and the number
        /// of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
        pub const ALU_STRIDE_SIZE: usize = 8;

        pub const MAX_STRIDE_SIZE: usize = 8;

        // Safety invariant: this is the pointer width in bytes
        pub const ALU_ALIGNMENT: usize = 4;

        // Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
        pub const ALU_ALIGNMENT_MASK: usize = 3;

        /// Safety: dst must point to valid space for writing four `usize`s
        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0x0000_FF00usize & word) << 8) |
                        (0x0000_00FFusize & word);
            let second = ((0xFF00_0000usize & word) >> 8) |
                         ((0x00FF_0000usize & word) >> 16);
            let third = ((0x0000_FF00usize & second_word) << 8) |
                        (0x0000_00FFusize & second_word);
            let fourth = ((0xFF00_0000usize & second_word) >> 8) |
                         ((0x00FF_0000usize & second_word) >> 16);
            // Safety: fn invariant used here
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }

        /// Safety: dst must point to valid space for writing two `usize`s
        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000usize & second) << 8) |
                       ((0x0000_00FFusize & second) << 16) |
                       ((0x00FF_0000usize & first) >> 8) |
                       (0x0000_00FFusize & first);
            let second_word = ((0x00FF_0000usize & fourth) << 8) |
                              ((0x0000_00FFusize & fourth) << 16) |
                              ((0x00FF_0000usize & third) >> 8) |
                              (0x0000_00FFusize & third);
            // Safety: fn invariant used here
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else if #[cfg(all(target_endian = "big", target_pointer_width = "64"))] {
        // Aligned ALU word, big-endian, 64-bit

        /// Safety invariant: this is the number of bytes consumed by
        /// unpack_alu, which is twice the pointer width because it consumes two usizes.
        /// This is also the number of bytes produced by pack_alu and the number
        /// of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
        pub const ALU_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        // Safety invariant: this is the pointer width in bytes
        pub const ALU_ALIGNMENT: usize = 8;

        // Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
        pub const ALU_ALIGNMENT_MASK: usize = 7;

        /// Safety: dst must point to valid space for writing four `usize`s
        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0xFF00_0000_0000_0000usize & word) >> 8) |
                        ((0x00FF_0000_0000_0000usize & word) >> 16) |
                        ((0x0000_FF00_0000_0000usize & word) >> 24) |
                        ((0x0000_00FF_0000_0000usize & word) >> 32);
            let second = ((0x0000_0000_FF00_0000usize & word) << 24) |
                         ((0x0000_0000_00FF_0000usize & word) << 16) |
                         ((0x0000_0000_0000_FF00usize & word) << 8) |
                         (0x0000_0000_0000_00FFusize & word);
            let third = ((0xFF00_0000_0000_0000usize & second_word) >> 8) |
                        ((0x00FF_0000_0000_0000usize & second_word) >> 16) |
                        ((0x0000_FF00_0000_0000usize & second_word) >> 24) |
                        ((0x0000_00FF_0000_0000usize & second_word) >> 32);
            let fourth = ((0x0000_0000_FF00_0000usize & second_word) << 24) |
                         ((0x0000_0000_00FF_0000usize & second_word) << 16) |
                         ((0x0000_0000_0000_FF00usize & second_word) << 8) |
                         (0x0000_0000_0000_00FFusize & second_word);
            // Safety: fn invariant used here
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }
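
        // Note (illustrative): on big endian the first byte in memory is the
        // most significant byte of the usize, so the shifts are mirrored
        // relative to the little-endian version: "abcdefgh" loads as
        // word = 0x6162_6364_6566_6768 and expands to
        //     first  = 0x0061_0062_0063_0064
        //     second = 0x0065_0066_0067_0068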

        /// Safety: dst must point to valid space for writing two `usize`s
        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000_0000_0000usize & first) << 8) |
                       ((0x0000_00FF_0000_0000usize & first) << 16) |
                       ((0x0000_0000_00FF_0000usize & first) << 24) |
                       ((0x0000_0000_0000_00FFusize & first) << 32) |
                       ((0x00FF_0000_0000_0000usize & second) >> 24) |
                       ((0x0000_00FF_0000_0000usize & second) >> 16) |
                       ((0x0000_0000_00FF_0000usize & second) >> 8) |
                       (0x0000_0000_0000_00FFusize & second);
            let second_word = ((0x00FF_0000_0000_0000usize & third) << 8) |
                              ((0x0000_00FF_0000_0000usize & third) << 16) |
                              ((0x0000_0000_00FF_0000usize & third) << 24) |
                              ((0x0000_0000_0000_00FFusize & third) << 32) |
                              ((0x00FF_0000_0000_0000usize & fourth) >> 24) |
                              ((0x0000_00FF_0000_0000usize & fourth) >> 16) |
                              ((0x0000_0000_00FF_0000usize & fourth) >> 8) |
                              (0x0000_0000_0000_00FFusize & fourth);
            // Safety: fn invariant used here
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else if #[cfg(all(target_endian = "big", target_pointer_width = "32"))] {
        // Aligned ALU word, big-endian, 32-bit

        /// Safety invariant: this is the number of bytes consumed by
        /// unpack_alu, which is twice the pointer width because it consumes two usizes.
        /// This is also the number of bytes produced by pack_alu and the number
        /// of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
        pub const ALU_STRIDE_SIZE: usize = 8;

        pub const MAX_STRIDE_SIZE: usize = 8;

        // Safety invariant: this is the pointer width in bytes
        pub const ALU_ALIGNMENT: usize = 4;

        // Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
        pub const ALU_ALIGNMENT_MASK: usize = 3;

        /// Safety: dst must point to valid space for writing four `usize`s
        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0xFF00_0000usize & word) >> 8) |
                        ((0x00FF_0000usize & word) >> 16);
            let second = ((0x0000_FF00usize & word) << 8) |
                         (0x0000_00FFusize & word);
            let third = ((0xFF00_0000usize & second_word) >> 8) |
                        ((0x00FF_0000usize & second_word) >> 16);
            let fourth = ((0x0000_FF00usize & second_word) << 8) |
                         (0x0000_00FFusize & second_word);
            // Safety: fn invariant used here
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }

        /// Safety: dst must point to valid space for writing two `usize`s
        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000usize & first) << 8) |
                       ((0x0000_00FFusize & first) << 16) |
                       ((0x00FF_0000usize & second) >> 8) |
                       (0x0000_00FFusize & second);
            let second_word = ((0x00FF_0000usize & third) << 8) |
                              ((0x0000_00FFusize & third) << 16) |
                              ((0x00FF_0000usize & fourth) >> 8) |
                              (0x0000_00FFusize & fourth);
            // Safety: fn invariant used here
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else {
        ascii_naive!(ascii_to_ascii, u8, u8);
        ascii_naive!(ascii_to_basic_latin, u8, u16);
        ascii_naive!(basic_latin_to_ascii, u16, u8);
    }
}

cfg_if! {
    // Safety-usable invariant: this counts the zero bits before the high bit of the
    // first byte (in text order) of UTF-8 data packed into a usize with the target
    // endianness
    if #[cfg(target_endian = "little")] {
        #[allow(dead_code)]
        #[inline(always)]
        fn count_zeros(word: usize) -> u32 {
            word.trailing_zeros()
        }
    } else {
        #[allow(dead_code)]
        #[inline(always)]
        fn count_zeros(word: usize) -> u32 {
            word.leading_zeros()
        }
    }
}
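
// Example (illustrative): on little endian, the first byte in text order is
// the least significant byte of the usize, so trailing_zeros() counts the
// bits below the first (in text order) set high bit; leading_zeros() plays
// the same role on big endian, where the first byte is the most significant.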

cfg_if! {
    if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "disabled"))] {
        /// Safety-usable invariant: Will return the value and position of the first non-ASCII byte in the slice in a Some if found.
        /// In other words, the first element of the Some is always `> 127`
        #[inline(always)]
        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
            let src = slice.as_ptr();
            let len = slice.len();
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    // Safety: src at offset is valid for a `SIMD_STRIDE_SIZE` read
                    let simd = unsafe { load16_unaligned(src.add(offset)) };
                    if !simd_is_ascii(simd) {
                        break;
                    }
                    offset += SIMD_STRIDE_SIZE;
                    // Break if `offset > len - SIMD_STRIDE_SIZE`; otherwise we always have
                    // at least `SIMD_STRIDE_SIZE` elements to munch next time.
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                let code_unit = slice[offset];
                if code_unit > 127 {
                    // Safety: Safety-usable invariant upheld here
                    return Some((code_unit, offset));
                }
                offset += 1;
            }
            None
        }
    } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
        #[inline(always)]
        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
            let src = slice.as_ptr();
            let len = slice.len();
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                // First, process one unaligned vector
                // Safety: src is valid for a `SIMD_STRIDE_SIZE` read
                let simd = unsafe { load16_unaligned(src) };
                let mask = mask_ascii(simd);
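                // Note: on SSE2, mask_ascii is assumed to be a movemask of
                // the per-byte high bits (one mask bit per byte lane), so
                // trailing_zeros() below yields the index of the first
                // non-ASCII byte within the vector.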
                if mask != 0 {
                    offset = mask.trailing_zeros() as usize;
                    let non_ascii = unsafe { *src.add(offset) };
                    return Some((non_ascii, offset));
                }
                offset = SIMD_STRIDE_SIZE;
                // Safety: Now that offset has changed we don't yet know how much it is valid for

                // We have now seen 16 ASCII bytes. Let's guess that
                // there will be enough more to justify more expense
                // in the case of non-ASCII.
                // Use aligned reads for the sake of old microarchitectures.
                // Safety: this correctly calculates the number of src_units that need to be read before the remaining slice is aligned.
                // This is by definition less than SIMD_ALIGNMENT, which is defined to be equal to SIMD_STRIDE_SIZE.
                let until_alignment = unsafe { (SIMD_ALIGNMENT - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK };
                // This addition won't overflow, because even in the 32-bit PAE case the
                // address space holds enough code that the slice length can't be that
                // close to address space size.
                // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
                //
                // Safety: if this check succeeds we're valid for reading at least `2 * SIMD_STRIDE_SIZE` elements plus `until_alignment`.
                // The extra SIMD_STRIDE_SIZE in the condition is because `offset` is already `SIMD_STRIDE_SIZE`.
                if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
                    if until_alignment != 0 {
                        // Safety: this is safe to call since we're valid for this read (and more), and don't care about alignment.
                        // This read covers some bytes that get checked again once `offset` is aligned, because `offset` advances
                        // by `until_alignment` rather than by SIMD_STRIDE_SIZE. This is fine.
                        let simd = unsafe { load16_unaligned(src.add(offset)) };
                        let mask = mask_ascii(simd);
                        if mask != 0 {
                            offset += mask.trailing_zeros() as usize;
                            let non_ascii = unsafe { *src.add(offset) };
                            return Some((non_ascii, offset));
                        }
                        offset += until_alignment;
                    }
                    // Safety: At this point we're valid for reading 2*SIMD_STRIDE_SIZE elements
                    // Safety: Now `offset` is aligned for `src`
                    let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
                    loop {
                        // Safety: We were valid for this read, and were aligned.
                        let first = unsafe { load16_aligned(src.add(offset)) };
                        let second = unsafe { load16_aligned(src.add(offset + SIMD_STRIDE_SIZE)) };
                        if !simd_is_ascii(first | second) {
                            // Safety: mask_ascii produces a mask of all the high bits.
                            let mask_first = mask_ascii(first);
                            if mask_first != 0 {
                                // Safety: on little-endian systems this will be the number of ascii bytes
                                // before the first non-ascii, i.e. valid for indexing src
                                // TODO SAFETY: What about big-endian systems?
                                offset += mask_first.trailing_zeros() as usize;
                            } else {
                                let mask_second = mask_ascii(second);
                                // Safety: on little-endian systems this will be the number of ascii bytes
                                // before the first non-ascii, i.e. valid for indexing src
                                offset += SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize;
                            }
                            // Safety: We know this is non-ASCII, and can uphold the safety-usable invariant here
                            let non_ascii = unsafe { *src.add(offset) };

                            return Some((non_ascii, offset));
                        }
                        offset += SIMD_STRIDE_SIZE * 2;
                        // Safety: Break if `offset > len - 2 * SIMD_STRIDE_SIZE`; otherwise we always have
                        // at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                        if offset > len_minus_stride_times_two {
                            break;
                        }
                    }
                    // Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE`
                    if offset + SIMD_STRIDE_SIZE <= len {
                        // Safety: We were valid for this read, and were aligned.
                        let simd = unsafe { load16_aligned(src.add(offset)) };
                        // Safety: mask_ascii produces a mask of all the high bits.
                        let mask = mask_ascii(simd);
                        if mask != 0 {
                            // Safety: on little-endian systems this will be the number of ascii bytes
                            // before the first non-ascii, i.e. valid for indexing src
                            offset += mask.trailing_zeros() as usize;
                            let non_ascii = unsafe { *src.add(offset) };
                            // Safety: We know this is non-ASCII, and can uphold the safety-usable invariant here
                            return Some((non_ascii, offset));
                        }
                        offset += SIMD_STRIDE_SIZE;
                    }
                } else {
                    // This is the unaligned branch: at most two iterations, so unroll.
                    // Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE`
                    if offset + SIMD_STRIDE_SIZE <= len {
                        // Safety: We're valid for this read but must use an unaligned read
                        let simd = unsafe { load16_unaligned(src.add(offset)) };
                        let mask = mask_ascii(simd);
                        if mask != 0 {
                            offset += mask.trailing_zeros() as usize;
                            let non_ascii = unsafe { *src.add(offset) };
                            // Safety-usable invariant upheld here (same as above)
                            return Some((non_ascii, offset));
                        }
                        offset += SIMD_STRIDE_SIZE;
                        // Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE`
                        if offset + SIMD_STRIDE_SIZE <= len {
                            // Safety: We're valid for this read but must use an unaligned read
                            let simd = unsafe { load16_unaligned(src.add(offset)) };
                            let mask = mask_ascii(simd);
                            if mask != 0 {
                                offset += mask.trailing_zeros() as usize;
                                let non_ascii = unsafe { *src.add(offset) };
                                // Safety-usable invariant upheld here (same as above)
                                return Some((non_ascii, offset));
                            }
                            offset += SIMD_STRIDE_SIZE;
                        }
                    }
                }
            }
            while offset < len {
                // Safety: relies straightforwardly on the `len` invariant
                let code_unit = unsafe { *(src.add(offset)) };
                if code_unit > 127 {
                    // Safety-usable invariant upheld here
                    return Some((code_unit, offset));
                }
                offset += 1;
            }
            None
        }
    } else {
        // Safety-usable invariant: returns byte index of first non-ascii byte
        #[inline(always)]
        fn find_non_ascii(word: usize, second_word: usize) -> Option<usize> {
            let word_masked = word & ASCII_MASK;
            let second_masked = second_word & ASCII_MASK;
            if (word_masked | second_masked) == 0 {
                // Both are ascii, invariant upheld
                return None;
            }
            if word_masked != 0 {
                let zeros = count_zeros(word_masked);
                // `zeros` is 8 times the number of ASCII bytes in text order before the
                // first non-ASCII byte, plus 7 in the little-endian case (the set bit is
                // the top bit of its byte, above its 7 low bits) or plus 0 in the
                // big-endian case. Shifting right by 3 yields the byte count either way.
                let num_ascii = (zeros >> 3) as usize;
                // Safety-usable invariant upheld here
                return Some(num_ascii);
            }
            let zeros = count_zeros(second_masked);
            // See the explanation of `zeros` above; the second word starts
            // ALU_ALIGNMENT bytes into the stride.
            let num_ascii = (zeros >> 3) as usize;
            // Safety-usable invariant upheld here
            Some(ALU_ALIGNMENT + num_ascii)
        }
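
        // Worked example (illustrative, little endian): for the eight bytes
        // "ab\xC3\xA9efgh", `word & ASCII_MASK` keeps only the high bits of
        // the two non-ASCII bytes; the lowest of those is bit 23, so
        // count_zeros() returns 23 and 23 >> 3 == 2: two ASCII bytes precede
        // the first non-ASCII one.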

        /// Safety: `src` must be valid for the reads of two `usize`s
        ///
        /// Safety-usable invariant: will return byte index of first non-ascii byte
        #[inline(always)]
        unsafe fn validate_ascii_stride(src: *const usize) -> Option<usize> {
            let word = *src;
            let second_word = *(src.add(1));
            find_non_ascii(word, second_word)
        }

        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
            let src = slice.as_ptr();
            let len = slice.len();
            let mut offset = 0usize;
            let mut until_alignment = (ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & ALU_ALIGNMENT_MASK;
            // Safety: if this check succeeds we're valid to read `until_alignment + ALU_STRIDE_SIZE` elements
            if until_alignment + ALU_STRIDE_SIZE <= len {
                while until_alignment != 0 {
                    let code_unit = slice[offset];
                    if code_unit > 127 {
                        // Safety-usable invariant upheld here
                        return Some((code_unit, offset));
                    }
                    offset += 1;
                    until_alignment -= 1;
                }
                // Safety: At this point we have read until_alignment elements and
                // are valid for `ALU_STRIDE_SIZE` more.
                let len_minus_stride = len - ALU_STRIDE_SIZE;
                loop {
                    // Safety: we were valid for this read
                    let ptr = unsafe { src.add(offset) as *const usize };
                    if let Some(num_ascii) = unsafe { validate_ascii_stride(ptr) } {
                        offset += num_ascii;
                        // Safety-usable invariant upheld here using the invariant from validate_ascii_stride()
                        return Some((unsafe { *(src.add(offset)) }, offset));
                    }
                    offset += ALU_STRIDE_SIZE;
                    // Safety: Break if `offset > len - ALU_STRIDE_SIZE`; otherwise we always
                    // have at least `ALU_STRIDE_SIZE` elements to munch next time.
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                let code_unit = slice[offset];
                if code_unit > 127 {
                    // Safety-usable invariant upheld here
                    return Some((code_unit, offset));
                }
                offset += 1;
            }
            None
        }

    }
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] {

    } else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
        // Even with NEON enabled, we use the ALU path for ASCII validation, because testing
        // on Exynos 5 indicated that using NEON isn't worthwhile where there are only
        // vector reads without vector writes.

        pub const ALU_STRIDE_SIZE: usize = 8;

        pub const ALU_ALIGNMENT: usize = 4;

        pub const ALU_ALIGNMENT_MASK: usize = 3;
    } else {
        // Safety: src points to two valid `usize`s, dst points to four valid `usize`s
        #[inline(always)]
        unsafe fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
            // Safety: src safety invariant used here
            let word = *src;
            let second_word = *(src.add(1));
            // Safety: dst safety invariant passed down
            unpack_alu(word, second_word, dst);
        }

        // Safety: src points to four valid `usize`s, dst points to two valid `usize`s
        #[inline(always)]
        unsafe fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
            // Safety: src safety invariant used here
            let first = *src;
            let second = *(src.add(1));
            let third = *(src.add(2));
            let fourth = *(src.add(3));
            // Safety: dst safety invariant passed down
            pack_alu(first, second, third, fourth, dst);
        }

        // Safety: src points to two valid `usize`s, dst points to four valid `usize`s
        #[inline(always)]
        unsafe fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool {
            // Safety: src safety invariant used here
            let word = *src;
            let second_word = *(src.add(1));
            // Check if the words contain non-ASCII
            if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 {
                return false;
            }
            // Safety: dst safety invariant passed down
            unpack_alu(word, second_word, dst);
            true
        }

        // Safety: src points to four valid `usize`s, dst points to two valid `usize`s
        #[inline(always)]
        unsafe fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool {
            // Safety: src safety invariant used here
            let first = *src;
            let second = *(src.add(1));
            let third = *(src.add(2));
            let fourth = *(src.add(3));
            if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 {
                return false;
            }
            // Safety: dst safety invariant passed down
            pack_alu(first, second, third, fourth, dst);
            true
        }

        // Safety: src, dst both point to two valid `usize`s each
        // Safety-usable invariant: Will return byte index of first non-ascii byte.
        #[inline(always)]
        unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option<usize> {
            // Safety: src safety invariant used here
            let word = *src;
            let second_word = *(src.add(1));
            // Safety: dst safety invariant used here
            *dst = word;
            *(dst.add(1)) = second_word;
            // Relies on the safety-usable invariant of find_non_ascii here
            find_non_ascii(word, second_word)
        }

        basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_alu);
        basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_alu);
        latin1_alu!(unpack_latin1, u8, u16, unpack_latin1_stride_alu);
        latin1_alu!(pack_latin1, u16, u8, pack_latin1_stride_alu);
        // Safety invariant upheld: ascii_to_ascii_stride will return the byte index of the first non-ascii byte if found
        ascii_alu!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride);
    }
}

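/// Returns the number of bytes at the start of `bytes` that are ASCII;
/// e.g. (illustrative) `ascii_valid_up_to(b"ab\xFFc")` returns 2.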
pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
    match validate_ascii(bytes) {
        None => bytes.len(),
        Some((_, num_valid)) => num_valid,
    }
}

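// In ISO-2022-JP, ESC (0x1B) starts an escape sequence and SO/SI (0x0E/0x0F)
// are shift codes, so an ASCII run ends at any of those bytes as well as at
// any byte above 0x7F.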
pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    for (i, b_ref) in bytes.iter().enumerate() {
        let b = *b_ref;
        if b >= 0x80 || b == 0x1B || b == 0x0E || b == 0x0F {
            return i;
        }
    }
    bytes.len()
}

// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;
    use alloc::vec::Vec;

    macro_rules! test_ascii {
        ($test_name:ident, $fn_tested:ident, $src_unit:ty, $dst_unit:ty) => {
            #[test]
            fn $test_name() {
                let mut src: Vec<$src_unit> = Vec::with_capacity(32);
                let mut dst: Vec<$dst_unit> = Vec::with_capacity(32);
                for i in 0..32 {
                    src.clear();
                    dst.clear();
                    dst.resize(32, 0);
                    for j in 0..32 {
                        let c = if i == j { 0xAA } else { j + 0x40 };
                        src.push(c as $src_unit);
                    }
                    match unsafe { $fn_tested(src.as_ptr(), dst.as_mut_ptr(), 32) } {
                        None => unreachable!("Should always find non-ASCII"),
                        Some((non_ascii, num_ascii)) => {
                            assert_eq!(non_ascii, 0xAA);
                            assert_eq!(num_ascii, i);
                            for j in 0..i {
                                assert_eq!(dst[j], (j + 0x40) as $dst_unit);
                            }
                        }
                    }
                }
            }
        };
    }

    test_ascii!(test_ascii_to_ascii, ascii_to_ascii, u8, u8);
    test_ascii!(test_ascii_to_basic_latin, ascii_to_basic_latin, u8, u16);
    test_ascii!(test_basic_latin_to_ascii, basic_latin_to_ascii, u16, u8);
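
    // A few extra sanity checks for the public entry points above; the
    // inputs are illustrative, not exhaustive.
    #[test]
    fn test_ascii_valid_up_to() {
        assert_eq!(ascii_valid_up_to(b"abc"), 3);
        assert_eq!(ascii_valid_up_to(b"ab\xFFc"), 2);
        assert_eq!(ascii_valid_up_to(b""), 0);
    }

    #[test]
    fn test_iso_2022_jp_ascii_valid_up_to() {
        // ESC (0x1B), SO (0x0E), SI (0x0F) and any byte above 0x7F end the run.
        assert_eq!(iso_2022_jp_ascii_valid_up_to(b"abc"), 3);
        assert_eq!(iso_2022_jp_ascii_valid_up_to(b"ab\x1Bc"), 2);
        assert_eq!(iso_2022_jp_ascii_valid_up_to(b"a\x0Ebc"), 1);
        assert_eq!(iso_2022_jp_ascii_valid_up_to(b"\x80abc"), 0);
    }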
}