encoding_rs/
ascii.rs

// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// It's assumed that in due course Rust will have explicit SIMD but will not
// be good at run-time selection of SIMD vs. no-SIMD. In such a future,
// x86_64 will always use SSE2 and 32-bit x86 will use SSE2 when compiled with
// a Mozilla-shipped rustc. SIMD support and especially detection on ARM is a
// mess. Under the circumstances, it seems to make sense to optimize the ALU
// case for ARMv7 rather than x86. Annoyingly, I was unable to get useful
// numbers out of the actual ARMv7 CPU I have access to, because (thermal?)
// throttling kept interfering. Since Raspberry Pi 3 (ARMv8 core but running
// ARMv7 code) produced reproducible performance numbers, that's the ARM
// computer that this code ended up being optimized for in the ALU case.
// Less popular CPU architectures simply get the approach that was chosen based
// on Raspberry Pi 3 measurements. The UTF-16 and UTF-8 ALU cases take
// different approaches based on benchmarking on Raspberry Pi 3.

#[cfg(all(
    feature = "simd-accel",
    any(
        target_feature = "sse2",
        all(target_endian = "little", target_arch = "aarch64"),
        all(target_endian = "little", target_feature = "neon")
    )
))]
use crate::simd_funcs::*;

cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        #[allow(unused_imports)]
        use ::core::intrinsics::unlikely;
        #[allow(unused_imports)]
        use ::core::intrinsics::likely;
    } else {
        #[allow(dead_code)]
        #[inline(always)]
        fn unlikely(b: bool) -> bool {
            b
        }
        #[allow(dead_code)]
        #[inline(always)]
        fn likely(b: bool) -> bool {
            b
        }
    }
}

// Safety invariants for masks: data & mask == 0 for valid ASCII or Basic Latin UTF-16

// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
pub const ASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize;

// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
pub const BASIC_LATIN_MASK: usize = 0xFF80_FF80_FF80_FF80u64 as usize;
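
// Illustration (not part of the original source): on a 64-bit target, a
// usize loaded from eight ASCII bytes has no high bit set in any byte, so
// ANDing with ASCII_MASK yields zero, while any non-ASCII byte trips the
// mask:
//
//     let ascii = usize::from_ne_bytes(*b"example!");
//     assert_eq!(ascii & ASCII_MASK, 0);
//     let non_ascii = usize::from_ne_bytes(*b"exampl\xC3\xA9");
//     assert_ne!(non_ascii & ASCII_MASK, 0);
//
// BASIC_LATIN_MASK plays the same role for u16 code units read as usize:
// any code unit above 0x7F sets bits under 0xFF80.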

#[allow(unused_macros)]
macro_rules! ascii_naive {
    ($name:ident, $src_unit:ty, $dst_unit:ty) => {
        /// Safety: src and dst must have len elements and be aligned
        /// Safety-usable invariant: will return Some() when it fails
        /// to convert. The first value will be a u8 that is > 127.
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            // Yes, manually omitting the bound check here matters
            // a lot for perf.
            for i in 0..len {
                // Safety: len invariant used here
                let code_unit = *(src.add(i));
                // Safety: Upholds safety-usable invariant here
                if code_unit > 127 {
                    return Some((code_unit, i));
                }
                // Safety: len invariant used here
                *(dst.add(i)) = code_unit as $dst_unit;
            }
            return None;
        }
    };
}
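
// Hypothetical instantiation (for orientation; the real invocations live
// elsewhere in this crate): expanding
//
//     ascii_naive!(ascii_to_ascii_naive, u8, u8);
//
// yields an unsafe fn that a caller with two valid, non-overlapping buffers
// of `len` elements could use as
//
//     let result = unsafe { ascii_to_ascii_naive(src.as_ptr(), dst.as_mut_ptr(), len) };
//
// getting `None` on success or `Some((first_non_ascii, index))` on failure,
// per the safety-usable invariant documented above.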

#[allow(unused_macros)]
macro_rules! ascii_alu {
    ($name:ident,
     // safety invariant: src/dst MUST be u8
     $src_unit:ty,
     $dst_unit:ty,
     // Safety invariant: stride_fn must consume and produce two usizes, and return the index of the first non-ascii when it fails
     $stride_fn:ident) => {
        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
        /// write
        /// Safety-usable invariant: will return Some() when it fails
        /// to convert. The first value will be a u8 that is > 127.
        #[cfg_attr(feature = "cargo-clippy", allow(never_loop, cast_ptr_alignment))]
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                // Safety: until_alignment becomes the number of bytes we need to munch until we are aligned to usize
                let mut until_alignment = {
                    // Check if the other unit aligns if we move the narrower unit
                    // to alignment.
                    //               if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() {
                    // ascii_to_ascii
                    let src_alignment = (src as usize) & ALU_ALIGNMENT_MASK;
                    let dst_alignment = (dst as usize) & ALU_ALIGNMENT_MASK;
                    if src_alignment != dst_alignment {
                        // Safety: bails early and ends up in the naïve branch where usize-alignment doesn't matter
                        break;
                    }
                    (ALU_ALIGNMENT - src_alignment) & ALU_ALIGNMENT_MASK
                    //               } else if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                    // ascii_to_basic_latin
                    //                   let src_until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
                    //                   if (dst.add(src_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
                    //                       break;
                    //                   }
                    //                   src_until_alignment
                    //               } else {
                    // basic_latin_to_ascii
                    //                   let dst_until_alignment = (ALIGNMENT - ((dst as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
                    //                   if (src.add(dst_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
                    //                       break;
                    //                   }
                    //                   dst_until_alignment
                    //               }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
                    // mixed results when encoding from UTF-16 and since x86 and
                    // x86_64 should be using SSE2 in due course, keeping the move
                    // to alignment here. It would be good to test on more ARM CPUs
                    // and on real MIPS and POWER hardware.
                    //
                    // Safety: This is the naïve code once again, for `until_alignment` bytes
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        if code_unit > 127 {
                            // Safety: Upholds safety-usable invariant here
                            return Some((code_unit, offset));
                        }
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        // Safety: offset is the number of bytes copied so far
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        // Safety: num_ascii is known to be a byte index of a non-ascii byte due to stride_fn's invariant
                        if let Some(num_ascii) = $stride_fn(
                            // Safety: These are known to be valid and aligned since we have at
                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
                            // number of elements copied so far, which according to the
                            // until_alignment calculation above will cause both src and dst to be
                            // aligned to usize after this add
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        ) {
                            offset += num_ascii;
                            // Safety: Upholds safety-usable invariant here by indexing into non-ascii byte
                            return Some((*(src.add(offset)), offset));
                        }
                        // Safety: offset continues to be the number of bytes copied so far, and
                        // maintains usize alignment for the next loop iteration
                        offset += ALU_STRIDE_SIZE;
                        // Safety: This is `offset > len - stride`. This loop will continue as long as
                        // `offset <= len - stride`, which means there are `stride` bytes still to be read.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }

            // Safety: This is the naïve code, same as ascii_naive, and has no requirements
            // other than src/dst being valid for the right lengths
            while offset < len {
                // Safety: len invariant used here
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety: Upholds safety-usable invariant here
                    return Some((code_unit, offset));
                }
                // Safety: len invariant used here
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
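
// Worked example of the alignment arithmetic above (illustrative only),
// assuming ALU_ALIGNMENT == 8 and ALU_ALIGNMENT_MASK == 7 as on 64-bit
// targets: a pointer whose address ends in 0b101 gives
//
//     src_alignment = addr & 7 = 5
//     until_alignment = (8 - 5) & 7 = 3
//
// so three lead-in bytes are copied one at a time before the word-sized
// stride loop runs on usize-aligned pointers. An already-aligned pointer
// gives (8 - 0) & 7 = 0 thanks to the outer mask.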

#[allow(unused_macros)]
macro_rules! basic_latin_alu {
    ($name:ident,
    // safety invariant: use u8 for src/dest for ascii, and u16 for basic_latin
     $src_unit:ty,
     $dst_unit:ty,
    // safety invariant: stride function must munch ALU_STRIDE_SIZE*size(src_unit) bytes off of src and
    // write ALU_STRIDE_SIZE*size(dst_unit) bytes to dst
     $stride_fn:ident) => {
        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
        /// write
        /// Safety-usable invariant: will return Some() when it fails
        /// to convert. The first value will be a u8 that is > 127.
        #[cfg_attr(
            feature = "cargo-clippy",
            allow(never_loop, cast_ptr_alignment, cast_lossless)
        )]
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                // Safety: until_alignment becomes the number of bytes we need to munch from src/dest until we are aligned to usize
                // We ensure basic-latin has the same alignment as ascii, starting with ascii since it is smaller.
                let mut until_alignment = {
                    // Check if the other unit aligns if we move the narrower unit
                    // to alignment.
                    //               if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() {
                    // ascii_to_ascii
                    //                   let src_alignment = (src as usize) & ALIGNMENT_MASK;
                    //                   let dst_alignment = (dst as usize) & ALIGNMENT_MASK;
                    //                   if src_alignment != dst_alignment {
                    //                       break;
                    //                   }
                    //                   (ALIGNMENT - src_alignment) & ALIGNMENT_MASK
                    //               } else
                    if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                        // ascii_to_basic_latin
                        let src_until_alignment = (ALU_ALIGNMENT
                            - ((src as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        src_until_alignment
                    } else {
                        // basic_latin_to_ascii
                        let dst_until_alignment = (ALU_ALIGNMENT
                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        dst_until_alignment
                    }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
                    // mixed results when encoding from UTF-16 and since x86 and
                    // x86_64 should be using SSE2 in due course, keeping the move
                    // to alignment here. It would be good to test on more ARM CPUs
                    // and on real MIPS and POWER hardware.
                    //
                    // Safety: This is the naïve code once again, for `until_alignment` bytes
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        if code_unit > 127 {
                            // Safety: Upholds safety-usable invariant here
                            return Some((code_unit, offset));
                        }
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        // Safety: offset is the number of bytes copied so far
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        if !$stride_fn(
                            // Safety: These are known to be valid and aligned since we have at
                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
                            // number of elements copied so far, which according to the
                            // until_alignment calculation above will cause both src and dst to be
                            // aligned to usize after this add
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        ) {
                            break;
                        }
                        // Safety: offset continues to be the number of bytes copied so far, and
                        // maintains usize alignment for the next loop iteration
                        offset += ALU_STRIDE_SIZE;
                        // Safety: This is `offset > len - stride`. This loop will continue as long as
                        // `offset <= len - stride`, which means there are `stride` bytes still to be read.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            // Safety: This is the naïve code once again, for leftover bytes
            while offset < len {
                // Safety: len invariant used here
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety: Upholds safety-usable invariant here
                    return Some((code_unit, offset));
                }
                // Safety: len invariant used here
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
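
// Minimal scalar sketch of the unpack step performed by the stride
// functions in the ascii_to_basic_latin direction (the real stride
// functions are defined elsewhere in this file and work on full usize
// words; this uses a little-endian u32 for brevity):
//
//     let word: u32 = 0x4443_4241; // the bytes "ABCD" read little-endian
//     let lo = (word & 0xFF) | ((word & 0xFF00) << 8); // 0x0042_0041: 'A', 'B' as u16s
//     let hi = ((word >> 16) & 0xFF) | ((word >> 8) & 0x00FF_0000); // 0x0044_0043: 'C', 'D' as u16s
//
// Each source word of ASCII bytes therefore expands into two destination
// words of UTF-16 code units, matching the stride-size invariant stated on
// $stride_fn above.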

#[allow(unused_macros)]
macro_rules! latin1_alu {
    // safety invariant: stride function must munch ALU_STRIDE_SIZE*size(src_unit) bytes off of src and
    // write ALU_STRIDE_SIZE*size(dst_unit) bytes to dst
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_fn:ident) => {
        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
        /// write
        #[cfg_attr(
            feature = "cargo-clippy",
            allow(never_loop, cast_ptr_alignment, cast_lossless)
        )]
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                // Safety: until_alignment becomes the number of bytes we need to munch from src/dest until we are aligned to usize
                // We ensure the UTF-16 side has the same alignment as the Latin-1 side, starting with Latin-1 since it is smaller.
                let mut until_alignment = {
                    if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                        // unpack
                        let src_until_alignment = (ALU_ALIGNMENT
                            - ((src as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        src_until_alignment
                    } else {
                        // pack
                        let dst_until_alignment = (ALU_ALIGNMENT
                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        dst_until_alignment
                    }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Safety: This is the naïve code once again, for `until_alignment` bytes
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        // Safety: offset is the number of bytes copied so far
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        $stride_fn(
                            // Safety: These are known to be valid and aligned since we have at
                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
                            // number of elements copied so far, which according to the
                            // until_alignment calculation above will cause both src and dst to be
                            // aligned to usize after this add
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        );
                        // Safety: offset continues to be the number of bytes copied so far, and
                        // maintains usize alignment for the next loop iteration
                        offset += ALU_STRIDE_SIZE;
                        // Safety: This is `offset > len - stride`. This loop will continue as long as
                        // `offset <= len - stride`, which means there are `stride` bytes still to be read.
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            // Safety: This is the naïve code once again, for leftover bytes
            while offset < len {
                // Safety: len invariant used here
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}
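
// Direction note with a tiny sketch (illustrative): "unpack" widens Latin-1
// u8 to UTF-16 u16 and "pack" narrows u16 back to u8. Because every Latin-1
// code point is at most U+00FF, packing is a lossless truncation here:
//
//     let c: u16 = 0x00E9; // 'é' as a UTF-16 code unit
//     let b = c as u8;     // 0xE9, the same character in Latin-1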

#[allow(unused_macros)]
macro_rules! ascii_simd_check_align {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned/aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_dst_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident
    ) => {
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        ///
        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // XXX Should we first process one stride unconditionally as unaligned to
                // avoid the cost of the branchiness below if the first stride fails anyway?
                // XXX Should we just use unaligned SSE2 access unconditionally? It seems that
                // on Haswell, it would make sense to just use unaligned and not bother
                // checking. Need to benchmark older architectures before deciding.
                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
                // Safety: checking whether src is aligned
                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
                    // Safety: Checking whether dst is aligned
                    if dst_masked == 0 {
                        loop {
                            // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and have the appropriate alignments
                            if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and have the appropriate alignments
                            if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                } else {
                    if dst_masked == 0 {
                        loop {
                            // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and have the appropriate alignments
                            if !$stride_dst_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: We're valid to read/write SIMD_STRIDE_SIZE elements and have the appropriate alignments
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE` which means we always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                }
            }
            while offset < len {
                // Safety: uses len invariant here and below
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety: upholds safety-usable invariant
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
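
// Hypothetical instantiation sketch (placeholder names; the stride
// functions are generated by the *_simd_stride macros later in this file):
//
//     ascii_simd_check_align!(
//         ascii_to_ascii_simd,
//         u8,
//         u8,
//         ascii_to_ascii_stride_both_aligned,
//         ascii_to_ascii_stride_src_aligned,
//         ascii_to_ascii_stride_dst_aligned,
//         ascii_to_ascii_stride_neither_aligned
//     );
//
// The four-way aligned/unaligned dispatch is decided once per call, so each
// inner loop runs with a statically chosen load/store pair.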

#[allow(unused_macros)]
macro_rules! ascii_simd_check_align_unrolled {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident,
        // Safety: This function must require aligned src/dest that are valid for reading/writing 2*SIMD_STRIDE_SIZE src_unit/dst_unit
        $double_stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing 2*SIMD_STRIDE_SIZE src_unit/dst_unit
        $double_stride_src_aligned:ident
    ) => {
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        ///
        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let unit_size = ::core::mem::size_of::<$src_unit>();
            let mut offset = 0usize;
            // This loop is only broken out of as a goto forward without
            // actually looping
            'outer: loop {
                // Safety: if this check succeeds we're valid for reading/writing at least `SIMD_STRIDE_SIZE` elements.
                if SIMD_STRIDE_SIZE <= len {
                    // First, process one unaligned
                    // Safety: this is safe to call since we're valid for this read/write
                    if !$stride_neither_aligned(src, dst) {
                        break 'outer;
                    }
                    offset = SIMD_STRIDE_SIZE;

                    // We have now seen 16 ASCII bytes. Let's guess that
                    // there will be enough more to justify more expense
                    // in the case of non-ASCII.
                    // Use aligned reads for the sake of old microarchitectures.
                    //
                    // Safety: this correctly calculates the number of src_units that need to be read before the remaining slice is aligned.
                    // This is less than SIMD_ALIGNMENT, which is also SIMD_STRIDE_SIZE (as documented)
                    let until_alignment = ((SIMD_ALIGNMENT
                        - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK))
                        & SIMD_ALIGNMENT_MASK)
                        / unit_size;
                    // Safety: This addition won't overflow, because even in the 32-bit PAE case the
                    // address space holds enough code that the slice length can't be that
                    // close to address space size.
                    // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
                    //
                    // Safety: if this check succeeds we're valid for reading/writing at least `2 * SIMD_STRIDE_SIZE` elements plus `until_alignment`.
                    // The extra SIMD_STRIDE_SIZE in the condition is because `offset` is already `SIMD_STRIDE_SIZE`.
                    if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
                        if until_alignment != 0 {
                            // Safety: this is safe to call since we're valid for this read/write (and more), and don't care about alignment
                            // This will copy over bytes that get decoded twice since it's not incrementing `offset` by SIMD_STRIDE_SIZE. This is fine.
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += until_alignment;
                        }
                        // Safety: At this point we're valid for reading/writing 2*SIMD_STRIDE_SIZE elements
                        // Safety: Now `offset` is aligned for `src`
                        let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
                        // Safety: This is whether dst is aligned
                        let dst_masked = (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK;
                        if dst_masked == 0 {
                            loop {
                                // Safety: both are aligned, we can call the aligned function. We're valid for reading/writing double stride from the initial condition
                                // and the loop break condition below
                                if let Some(advance) =
                                    $double_stride_both_aligned(src.add(offset), dst.add(offset))
                                {
                                    offset += advance;
                                    let code_unit = *(src.add(offset));
                                    // Safety: uses safety-usable invariant on ascii_to_ascii_simd_double_stride to return
                                    // guaranteed non-ascii
                                    return Some((code_unit, offset));
                                }
                                offset += SIMD_STRIDE_SIZE * 2;
                                // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which means we always have at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                                if offset > len_minus_stride_times_two {
                                    break;
                                }
                            }
                            // Safety: We're valid for reading/writing one more, and can still assume alignment
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        } else {
                            loop {
                                // Safety: only src is aligned here. We're valid for reading/writing double stride from the initial condition
                                // and the loop break condition below
                                if let Some(advance) =
                                    $double_stride_src_aligned(src.add(offset), dst.add(offset))
                                {
                                    offset += advance;
                                    let code_unit = *(src.add(offset));
                                    // Safety: uses safety-usable invariant on ascii_to_ascii_simd_double_stride to return
                                    // guaranteed non-ascii
                                    return Some((code_unit, offset));
                                }
                                offset += SIMD_STRIDE_SIZE * 2;
                                // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which means we always have at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                                if offset > len_minus_stride_times_two {
                                    break;
                                }
                            }
                            // Safety: We're valid for reading/writing one more, and can still assume alignment
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        }
                    } else {
                        // At most two iterations, so unroll
                        if offset + SIMD_STRIDE_SIZE <= len {
                            // Safety: The check above ensures we're allowed to read/write this, and we don't use alignment
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset + SIMD_STRIDE_SIZE <= len {
                                // Safety: The check above ensures we're allowed to read/write this, and we don't use alignment
                                if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                    break;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        }
                    }
                }
                break 'outer;
            }
            while offset < len {
                // Safety: relies straightforwardly on the `len` invariant
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety-usable invariant upheld here
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
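
// Control-flow note for the double-stride fast path above (descriptive,
// added for clarity): a $double_stride_* function returns None when all
// 2 * SIMD_STRIDE_SIZE bytes were ASCII and copied, or Some(advance) where
// `advance` is the index of the first non-ASCII byte within the chunk, so
//
//     offset += advance;
//     return Some((*(src.add(offset)), offset));
//
// lands exactly on the offending byte, upholding the safety-usable
// invariant documented on the generated function.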

#[allow(unused_macros)]
macro_rules! latin1_simd_check_align {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned/aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_dst_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident
    ) => {
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // Whether dst is aligned
                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
                // Whether src is aligned
                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
                    if dst_masked == 0 {
                        loop {
                            // Safety: Both were aligned, we can use the aligned function
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in the next iteration we're valid for
                            // reading/writing at least SIMD_STRIDE_SIZE elements.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: src was aligned, dst was not
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in the next iteration we're valid for
                            // reading/writing at least SIMD_STRIDE_SIZE elements.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                } else {
                    if dst_masked == 0 {
                        loop {
                            // Safety: dst was aligned, src was not
                            $stride_dst_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in the next iteration we're valid for
                            // reading/writing at least SIMD_STRIDE_SIZE elements.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: Neither were aligned
                            $stride_neither_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which means in the next iteration we're valid for
                            // reading/writing at least SIMD_STRIDE_SIZE elements.
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                }
            }
            while offset < len {
                // Safety: relies straightforwardly on the `len` invariant
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! latin1_simd_check_align_unrolled {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        // Safety: This function must require aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_both_aligned:ident,
        // Safety: This function must require aligned/unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_src_aligned:ident,
        // Safety: This function must require unaligned/aligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_dst_aligned:ident,
        // Safety: This function must require unaligned src/dest that are valid for reading/writing SIMD_STRIDE_SIZE src_unit/dst_unit
        $stride_neither_aligned:ident
    ) => {
        /// Safety: src/dst must be valid for reads/writes of `len` elements of their units.
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let unit_size = ::core::mem::size_of::<$src_unit>();
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `SIMD_STRIDE_SIZE` elements.
            if SIMD_STRIDE_SIZE <= len {
                // Safety: this correctly calculates the number of src_units that need to be read before the remaining slice is aligned.
                // This is by definition less than SIMD_STRIDE_SIZE.
                let mut until_alignment = ((SIMD_STRIDE_SIZE
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                while until_alignment != 0 {
                    // Safety: This is a straightforward copy, since until_alignment is < SIMD_STRIDE_SIZE < len, this is in-bounds
                    *(dst.add(offset)) = *(src.add(offset)) as $dst_unit;
                    offset += 1;
                    until_alignment -= 1;
                }
                // Safety: here offset will be `until_alignment`, i.e. enough to align `src`.
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // Safety: if this check succeeds we're valid for reading/writing at least `2 * SIMD_STRIDE_SIZE` elements.
                if offset + SIMD_STRIDE_SIZE * 2 <= len {
                    let len_minus_stride_times_two = len_minus_stride - SIMD_STRIDE_SIZE;
                    // Safety: at this point src is known to be aligned at offset, dst is not.
                    if (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK == 0 {
                        loop {
                            // Safety: We checked alignment of dst above, we can use the aligned functions. We're allowed to read/write 2*SIMD_STRIDE_SIZE elements, which we do.
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which means we always have at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride_times_two {
                                break;
                            }
                        }
                    } else {
                        loop {
                            // Safety: we ensured alignment of src already.
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which means we always have at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
                            if offset > len_minus_stride_times_two {
                                break;
                            }
                        }
                    }
                }
                // Safety: This checks `offset < len - SIMD_STRIDE_SIZE`, which means we are valid to munch SIMD_STRIDE_SIZE more elements, which we do
                if offset < len_minus_stride {
                    $stride_src_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                }
            }
            while offset < len {
                // Safety: uses len invariant here and below
                let code_unit = *(src.add(offset));
                // On x86_64, this loop autovectorizes but in the pack
                // case there are instructions whose purpose is to make sure
                // each u16 in the vector is truncated before packing. However,
                // since we don't care about the saturating behavior of SSE2 packing
                // when the input isn't Latin1, those instructions are useless.
                // Unfortunately, using the `assume` intrinsic to lie to the
                // optimizer doesn't make LLVM omit the truncation that we
                // don't need. Possibly this loop could be manually optimized
                // to do the sort of thing that LLVM does but without
                // ANDing the read vectors of u16 with a constant that discards
                // the high half of each u16. As far as I can tell, the
                // optimization assumes that doing a SIMD read past the end of
                // the array is OK.
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_simd_unalign {
    // Safety: stride_neither_aligned must be a function that requires src/dest be valid for unaligned reads/writes for SIMD_STRIDE_SIZE elements of type src_unit/dest_unit
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        /// Safety: src and dst must be valid for reads/writes of len elements of type src_unit/dst_unit
        ///
        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `stride` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    // Safety: We know we're valid for `stride` reads/writes, so we can call this function. We don't need alignment.
                    if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                        break;
                    }
                    offset += SIMD_STRIDE_SIZE;
                    // This is `offset > len - stride` which means we always have at least `stride` elements to munch next time.
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                // Safety: Uses len invariant here and below
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    // Safety-usable invariant upheld here
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}

#[allow(unused_macros)]
macro_rules! latin1_simd_unalign {
    // Safety: stride_neither_aligned must be a function that requires src/dest be valid for unaligned reads/writes for SIMD_STRIDE_SIZE elements of type src_unit/dest_unit
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        /// Safety: src and dst must be valid for unaligned reads/writes of len elements of type src_unit/dst_unit
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading/writing at least `stride` elements.
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    // Safety: We know we're valid for `stride` reads/writes, so we can call this function. We don't need alignment.
                    $stride_neither_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                    // This is `offset > len - stride` which means we always have at least `stride` elements to munch next time.
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                // Safety: Uses len invariant here
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_to_ascii_simd_stride {
    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 16 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either u16x8 or u8x16.
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            $store(dst, simd);
            true
        }
    };
}
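
// Hypothetical expansions (names assumed to match the load/store helpers in
// simd_funcs):
//
//     ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
//     ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
//
// Each expansion copies one 16-byte vector and reports via its bool return
// value whether the whole vector was ASCII.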

#[allow(unused_macros)]
macro_rules! ascii_to_ascii_simd_double_stride {
    // Safety: store must be valid for 32 bytes of write, which may be unaligned (candidates: `store(8|16)_(aligned|unaligned)`)
    ($name:ident, $store:ident) => {
        /// Safety: src must be valid for 32 bytes of aligned u8x16 read
        /// dst must be valid for 32 bytes of unaligned write according to
        /// the $store fn, which may allow for unaligned writes or require
        /// alignment to either u16x8 or u8x16.
        ///
        /// Safety-usable invariant: Returns Some(index) if the element at `index` is non-ASCII
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            $store(dst, first);
            if unlikely(!simd_is_ascii(first | second)) {
                // Safety: mask_ascii produces a mask of all the high bits.
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    // Safety: on little-endian systems this will be the number of ascii bytes
                    // before the first non-ascii, i.e. valid for indexing src
                    // TODO SAFETY: What about big-endian systems?
                    return Some(mask_first.trailing_zeros() as usize);
                }
                $store(dst.add(SIMD_STRIDE_SIZE), second);
                let mask_second = mask_ascii(second);
                // Safety: on little-endian systems this will be the number of ascii bytes
                // before the first non-ascii, i.e. valid for indexing src
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            $store(dst.add(SIMD_STRIDE_SIZE), second);
            None
        }
    };
}
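
// Worked example of the mask logic above (illustrative): mask_ascii packs
// the high bit of each of the 16 byte lanes into an integer, lane i at
// bit i. If the first non-ASCII content of `first` were the two-byte UTF-8
// sequence 0xC3 0xA9 starting at lane 8, then
//
//     mask_first == 0b11_0000_0000
//     mask_first.trailing_zeros() == 8
//
// which is the index of the first non-ASCII byte given little-endian lane
// ordering (hence the open question in the comment above about big-endian
// targets).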
1005
1006#[allow(unused_macros)]
1007macro_rules! ascii_to_basic_latin_simd_stride {
1008    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
1009    ($name:ident, $load:ident, $store:ident) => {
1010        /// Safety: src and dst must be valid for 16/32 bytes of read/write according to
1011        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either u16x8 or u8x16.
1013        #[inline(always)]
1014        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> bool {
1015            let simd = $load(src);
1016            if !simd_is_ascii(simd) {
1017                return false;
1018            }
1019            let (first, second) = simd_unpack(simd);
1020            $store(dst, first);
1021            $store(dst.add(8), second);
1022            true
1023        }
1024    };
1025}
1026
1027#[allow(unused_macros)]
1028macro_rules! ascii_to_basic_latin_simd_double_stride {
1029    // Safety: store must be valid for 16 bytes of write, which may be unaligned
1030    ($name:ident, $store:ident) => {
        /// Safety: src must be valid for 2*SIMD_STRIDE_SIZE bytes of aligned reads,
        /// aligned to either u16x8 or u8x16.
        /// dst must be valid for 2*SIMD_STRIDE_SIZE u16 writes (4*SIMD_STRIDE_SIZE
        /// bytes), aligned or unaligned according to the $store fn
1034        #[inline(always)]
1035        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> Option<usize> {
1036            let first = load16_aligned(src);
1037            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
1038            let (a, b) = simd_unpack(first);
1039            $store(dst, a);
1040            // Safety: divide by 2 since it's a u16 pointer
1041            $store(dst.add(SIMD_STRIDE_SIZE / 2), b);
1042            if unlikely(!simd_is_ascii(first | second)) {
1043                let mask_first = mask_ascii(first);
1044                if mask_first != 0 {
1045                    return Some(mask_first.trailing_zeros() as usize);
1046                }
1047                let (c, d) = simd_unpack(second);
1048                $store(dst.add(SIMD_STRIDE_SIZE), c);
1049                $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
1050                let mask_second = mask_ascii(second);
1051                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
1052            }
1053            let (c, d) = simd_unpack(second);
1054            $store(dst.add(SIMD_STRIDE_SIZE), c);
1055            $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
1056            None
1057        }
1058    };
1059}
1060
1061#[allow(unused_macros)]
1062macro_rules! unpack_simd_stride {
1063    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
1064    ($name:ident, $load:ident, $store:ident) => {
        /// Safety: src and dst must be valid for 16/32 bytes of read/write according to
        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either u16x8 or u8x16.
1068        #[inline(always)]
1069        pub unsafe fn $name(src: *const u8, dst: *mut u16) {
1070            let simd = $load(src);
1071            let (first, second) = simd_unpack(simd);
1072            $store(dst, first);
1073            $store(dst.add(8), second);
1074        }
1075    };
1076}
1077
1078#[allow(unused_macros)]
1079macro_rules! basic_latin_to_ascii_simd_stride {
1080    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
1081    ($name:ident, $load:ident, $store:ident) => {
1082        /// Safety: src and dst must be valid for 32/16 bytes of read/write according to
1083        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either u16x8 or u8x16.
1085        #[inline(always)]
1086        pub unsafe fn $name(src: *const u16, dst: *mut u8) -> bool {
1087            let first = $load(src);
1088            let second = $load(src.add(8));
1089            if simd_is_basic_latin(first | second) {
1090                $store(dst, simd_pack(first, second));
1091                true
1092            } else {
1093                false
1094            }
1095        }
1096    };
1097}
1098
1099#[allow(unused_macros)]
1100macro_rules! pack_simd_stride {
1101    // Safety: load/store must be valid for 16 bytes of read/write, which may be unaligned. (candidates: `(load|store)(16|8)_(unaligned|aligned)` functions)
1102    ($name:ident, $load:ident, $store:ident) => {
1103        /// Safety: src and dst must be valid for 32/16 bytes of read/write according to
1104        /// the $load/$store fn, which may allow for unaligned reads/writes or require
        /// alignment to either u16x8 or u8x16.
1106        #[inline(always)]
1107        pub unsafe fn $name(src: *const u16, dst: *mut u8) {
1108            let first = $load(src);
1109            let second = $load(src.add(8));
1110            $store(dst, simd_pack(first, second));
1111        }
1112    };
1113}
1114
1115cfg_if! {
1116    if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "aarch64"))] {
1117        // SIMD with the same instructions for aligned and unaligned loads and stores
1118
1119        pub const SIMD_STRIDE_SIZE: usize = 16;
1120
1121        pub const MAX_STRIDE_SIZE: usize = 16;
1122
1123//        pub const ALIGNMENT: usize = 8;
1124
1125        pub const ALU_STRIDE_SIZE: usize = 16;
1126
1127        pub const ALU_ALIGNMENT: usize = 8;
1128
1129        pub const ALU_ALIGNMENT_MASK: usize = 7;
1130
        // Safety for stride macros: We stick to the load16_unaligned/etc family of functions. We consistently produce
        // `_neither_aligned` variants using only unaligned loads and stores.
1133        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
1134
1135        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
1136        unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);
1137
1138        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
1139        pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);
1140
1141        // Safety for conversion macros: We use the unalign macro with unalign functions above. All stride functions were produced
1142        // by stride macros that universally munch a single SIMD_STRIDE_SIZE worth of elements.
1143        ascii_simd_unalign!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_neither_aligned);
1144        ascii_simd_unalign!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_neither_aligned);
1145        ascii_simd_unalign!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_neither_aligned);
1146        latin1_simd_unalign!(unpack_latin1, u8, u16, unpack_stride_neither_aligned);
1147        latin1_simd_unalign!(pack_latin1, u16, u8, pack_stride_neither_aligned);
1148    } else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
1149        // SIMD with different instructions for aligned and unaligned loads and stores.
1150        //
1151        // Newer microarchitectures are not supposed to have a performance difference between
        // aligned and unaligned loads and stores when the address is actually aligned,
1153        // but the benchmark results I see don't agree.
1154
1155        pub const SIMD_STRIDE_SIZE: usize = 16;
1156
1157        pub const MAX_STRIDE_SIZE: usize = 16;
1158
1159        pub const SIMD_ALIGNMENT_MASK: usize = 15;
1160
1161        // Safety for stride macros: We stick to the load8_aligned/etc family of functions. We consistently name
1162        // aligned/unaligned functions according to src/dst being aligned/unaligned
1163
1164        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
1165        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
1166        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_dst_aligned, load16_unaligned, store16_aligned);
1167        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
1168
1169        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
1170        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
1171        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_dst_aligned, load16_unaligned, store8_aligned);
1172        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
1173
1174        unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
1175        unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
1176        unpack_simd_stride!(unpack_stride_dst_aligned, load16_unaligned, store8_aligned);
1177        unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);
1178
1179        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
1180        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
1181        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
1182        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
1183
1184        pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
1185        pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
1186        pack_simd_stride!(pack_stride_dst_aligned, load8_unaligned, store16_aligned);
1187        pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);
1188
1189        // Safety for conversion macros: We use the correct pattern of both/src/dst/neither here. All stride functions were produced
1190        // by stride macros that universally munch a single SIMD_STRIDE_SIZE worth of elements.
1191
1192        ascii_simd_check_align!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_dst_aligned, ascii_to_ascii_stride_neither_aligned);
1193        ascii_simd_check_align!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_dst_aligned, ascii_to_basic_latin_stride_neither_aligned);
1194        ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
1195        latin1_simd_check_align!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
1196        latin1_simd_check_align!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
1197    } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
1198        // SIMD with different instructions for aligned and unaligned loads and stores.
1199        //
1200        // Newer microarchitectures are not supposed to have a performance difference between
1201        // aligned and unaligned SSE2 loads and stores when the address is actually aligned,
1202        // but the benchmark results I see don't agree.
1203
1204        pub const SIMD_STRIDE_SIZE: usize = 16;
1205
1206        /// Safety-usable invariant: This should be identical to SIMD_STRIDE_SIZE (used by ascii_simd_check_align_unrolled)
1207        pub const SIMD_ALIGNMENT: usize = 16;
1208
1209        pub const MAX_STRIDE_SIZE: usize = 16;
1210
1211        pub const SIMD_ALIGNMENT_MASK: usize = 15;
1212
1213        // Safety for stride macros: We stick to the load8_aligned/etc family of functions. We consistently name
1214        // aligned/unaligned functions according to src/dst being aligned/unaligned
1215
1216        ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_both_aligned, store16_aligned);
1217        ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_src_aligned, store16_unaligned);
1218
1219        ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_both_aligned, store8_aligned);
1220        ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_src_aligned, store8_unaligned);
1221
1222        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
1223        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
1224        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
1225
1226        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
1227        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
1228        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
1229
1230        unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
1231        unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
1232
1233        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
1234        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
1235        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
1236        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
1237
1238        pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
1239        pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
1240
1241        // Safety for conversion macros: We use the correct pattern of both/src/dst/neither/double_both/double_src here. All stride functions were produced
1242        // by stride macros that universally munch a single SIMD_STRIDE_SIZE worth of elements.
1243
1244        ascii_simd_check_align_unrolled!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_neither_aligned, ascii_to_ascii_simd_double_stride_both_aligned, ascii_to_ascii_simd_double_stride_src_aligned);
1245        ascii_simd_check_align_unrolled!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_neither_aligned, ascii_to_basic_latin_simd_double_stride_both_aligned, ascii_to_basic_latin_simd_double_stride_src_aligned);
1246
1247        ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
1248        latin1_simd_check_align_unrolled!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
1249        latin1_simd_check_align_unrolled!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
1250    } else if #[cfg(all(target_endian = "little", target_pointer_width = "64"))] {
1251        // Aligned ALU word, little-endian, 64-bit
1252
        /// Safety invariant: this is the number of bytes consumed by
1254        /// unpack_alu. This will be twice the pointer width, as it consumes two usizes.
1255        /// This is also the number of bytes produced by pack_alu.
1256        /// This is also the number of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
1257        pub const ALU_STRIDE_SIZE: usize = 16;
1258
1259        pub const MAX_STRIDE_SIZE: usize = 16;
1260
1261        // Safety invariant: this is the pointer width in bytes
1262        pub const ALU_ALIGNMENT: usize = 8;
1263
1264        // Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
1265        pub const ALU_ALIGNMENT_MASK: usize = 7;
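
        // Worked example for the functions below (illustrative): the ASCII
        // bytes b"abcdefgh" load little-endian as word = 0x6867_6665_6463_6261.
        // unpack_alu turns that into
        //     first  = 0x0064_0063_0062_0061  // "abcd" as u16 code units
        //     second = 0x0068_0067_0066_0065  // "efgh" as u16 code units
        // i.e. each ASCII byte is zero-extended to a u16 in text order.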
1266
1267        /// Safety: dst must point to valid space for writing four `usize`s
1268        #[inline(always)]
1269        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
1270            let first = ((0x0000_0000_FF00_0000usize & word) << 24) |
1271                        ((0x0000_0000_00FF_0000usize & word) << 16) |
1272                        ((0x0000_0000_0000_FF00usize & word) << 8) |
1273                        (0x0000_0000_0000_00FFusize & word);
1274            let second = ((0xFF00_0000_0000_0000usize & word) >> 8) |
1275                         ((0x00FF_0000_0000_0000usize & word) >> 16) |
1276                         ((0x0000_FF00_0000_0000usize & word) >> 24) |
1277                         ((0x0000_00FF_0000_0000usize & word) >> 32);
1278            let third = ((0x0000_0000_FF00_0000usize & second_word) << 24) |
1279                        ((0x0000_0000_00FF_0000usize & second_word) << 16) |
1280                        ((0x0000_0000_0000_FF00usize & second_word) << 8) |
1281                        (0x0000_0000_0000_00FFusize & second_word);
1282            let fourth = ((0xFF00_0000_0000_0000usize & second_word) >> 8) |
1283                         ((0x00FF_0000_0000_0000usize & second_word) >> 16) |
1284                         ((0x0000_FF00_0000_0000usize & second_word) >> 24) |
1285                         ((0x0000_00FF_0000_0000usize & second_word) >> 32);
1286            // Safety: fn invariant used here
1287            *dst = first;
1288            *(dst.add(1)) = second;
1289            *(dst.add(2)) = third;
1290            *(dst.add(3)) = fourth;
1291        }
1292
1293        /// Safety: dst must point to valid space for writing two `usize`s
1294        #[inline(always)]
1295        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
1296            let word = ((0x00FF_0000_0000_0000usize & second) << 8) |
1297                       ((0x0000_00FF_0000_0000usize & second) << 16) |
1298                       ((0x0000_0000_00FF_0000usize & second) << 24) |
1299                       ((0x0000_0000_0000_00FFusize & second) << 32) |
1300                       ((0x00FF_0000_0000_0000usize & first) >> 24) |
1301                       ((0x0000_00FF_0000_0000usize & first) >> 16) |
1302                       ((0x0000_0000_00FF_0000usize & first) >> 8) |
1303                       (0x0000_0000_0000_00FFusize & first);
1304            let second_word = ((0x00FF_0000_0000_0000usize & fourth) << 8) |
1305                              ((0x0000_00FF_0000_0000usize & fourth) << 16) |
1306                              ((0x0000_0000_00FF_0000usize & fourth) << 24) |
1307                              ((0x0000_0000_0000_00FFusize & fourth) << 32) |
1308                              ((0x00FF_0000_0000_0000usize & third) >> 24) |
1309                              ((0x0000_00FF_0000_0000usize & third) >> 16) |
1310                              ((0x0000_0000_00FF_0000usize & third) >> 8) |
1311                              (0x0000_0000_0000_00FFusize & third);
1312            // Safety: fn invariant used here
1313            *dst = word;
1314            *(dst.add(1)) = second_word;
1315        }
1316    } else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] {
1317        // Aligned ALU word, little-endian, 32-bit
1318
        /// Safety invariant: this is the number of bytes consumed by
1320        /// unpack_alu. This will be twice the pointer width, as it consumes two usizes.
1321        /// This is also the number of bytes produced by pack_alu.
1322        /// This is also the number of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
1323        pub const ALU_STRIDE_SIZE: usize = 8;
1324
1325        pub const MAX_STRIDE_SIZE: usize = 8;
1326
1327        // Safety invariant: this is the pointer width in bytes
1328        pub const ALU_ALIGNMENT: usize = 4;
1329
1330        // Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
1331        pub const ALU_ALIGNMENT_MASK: usize = 3;
1332
1333        /// Safety: dst must point to valid space for writing four `usize`s
1334        #[inline(always)]
1335        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
1336            let first = ((0x0000_FF00usize & word) << 8) |
1337                        (0x0000_00FFusize & word);
1338            let second = ((0xFF00_0000usize & word) >> 8) |
1339                         ((0x00FF_0000usize & word) >> 16);
1340            let third = ((0x0000_FF00usize & second_word) << 8) |
1341                        (0x0000_00FFusize & second_word);
1342            let fourth = ((0xFF00_0000usize & second_word) >> 8) |
1343                         ((0x00FF_0000usize & second_word) >> 16);
1344            // Safety: fn invariant used here
1345            *dst = first;
1346            *(dst.add(1)) = second;
1347            *(dst.add(2)) = third;
1348            *(dst.add(3)) = fourth;
1349        }
1350
1351        /// Safety: dst must point to valid space for writing two `usize`s
1352        #[inline(always)]
1353        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
1354            let word = ((0x00FF_0000usize & second) << 8) |
1355                       ((0x0000_00FFusize & second) << 16) |
1356                       ((0x00FF_0000usize & first) >> 8) |
1357                       (0x0000_00FFusize & first);
1358            let second_word = ((0x00FF_0000usize & fourth) << 8) |
1359                              ((0x0000_00FFusize & fourth) << 16) |
1360                              ((0x00FF_0000usize & third) >> 8) |
1361                              (0x0000_00FFusize & third);
1362            // Safety: fn invariant used here
1363            *dst = word;
1364            *(dst.add(1)) = second_word;
1365        }
1366    } else if #[cfg(all(target_endian = "big", target_pointer_width = "64"))] {
1367        // Aligned ALU word, big-endian, 64-bit
1368
        /// Safety invariant: this is the number of bytes consumed by
1370        /// unpack_alu. This will be twice the pointer width, as it consumes two usizes.
1371        /// This is also the number of bytes produced by pack_alu.
1372        /// This is also the number of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
1373        pub const ALU_STRIDE_SIZE: usize = 16;
1374
1375        pub const MAX_STRIDE_SIZE: usize = 16;
1376
1377        // Safety invariant: this is the pointer width in bytes
1378        pub const ALU_ALIGNMENT: usize = 8;
1379
1380        // Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
1381        pub const ALU_ALIGNMENT_MASK: usize = 7;
1382
1383        /// Safety: dst must point to valid space for writing four `usize`s
1384        #[inline(always)]
1385        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
1386            let first = ((0xFF00_0000_0000_0000usize & word) >> 8) |
1387                         ((0x00FF_0000_0000_0000usize & word) >> 16) |
1388                         ((0x0000_FF00_0000_0000usize & word) >> 24) |
1389                         ((0x0000_00FF_0000_0000usize & word) >> 32);
1390            let second = ((0x0000_0000_FF00_0000usize & word) << 24) |
1391                        ((0x0000_0000_00FF_0000usize & word) << 16) |
1392                        ((0x0000_0000_0000_FF00usize & word) << 8) |
1393                        (0x0000_0000_0000_00FFusize & word);
1394            let third = ((0xFF00_0000_0000_0000usize & second_word) >> 8) |
1395                         ((0x00FF_0000_0000_0000usize & second_word) >> 16) |
1396                         ((0x0000_FF00_0000_0000usize & second_word) >> 24) |
1397                         ((0x0000_00FF_0000_0000usize & second_word) >> 32);
1398            let fourth = ((0x0000_0000_FF00_0000usize & second_word) << 24) |
1399                        ((0x0000_0000_00FF_0000usize & second_word) << 16) |
1400                        ((0x0000_0000_0000_FF00usize & second_word) << 8) |
1401                        (0x0000_0000_0000_00FFusize & second_word);
1402            // Safety: fn invariant used here
1403            *dst = first;
1404            *(dst.add(1)) = second;
1405            *(dst.add(2)) = third;
1406            *(dst.add(3)) = fourth;
1407        }
1408
1409        /// Safety: dst must point to valid space for writing two `usize`s
1410        #[inline(always)]
1411        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000_0000_0000usize & first) << 8) |
                       ((0x0000_00FF_0000_0000usize & first) << 16) |
                       ((0x0000_0000_00FF_0000usize & first) << 24) |
                       ((0x0000_0000_0000_00FFusize & first) << 32) |
                       ((0x00FF_0000_0000_0000usize & second) >> 24) |
                       ((0x0000_00FF_0000_0000usize & second) >> 16) |
                       ((0x0000_0000_00FF_0000usize & second) >> 8) |
                       (0x0000_0000_0000_00FFusize & second);
            let second_word = ((0x00FF_0000_0000_0000usize & third) << 8) |
                              ((0x0000_00FF_0000_0000usize & third) << 16) |
                              ((0x0000_0000_00FF_0000usize & third) << 24) |
                              ((0x0000_0000_0000_00FFusize & third) << 32) |
                              ((0x00FF_0000_0000_0000usize & fourth) >> 24) |
                              ((0x0000_00FF_0000_0000usize & fourth) >> 16) |
                              ((0x0000_0000_00FF_0000usize & fourth) >> 8) |
                              (0x0000_0000_0000_00FFusize & fourth);
1428            // Safety: fn invariant used here
1429            *dst = word;
1430            *(dst.add(1)) = second_word;
1431        }
1432    } else if #[cfg(all(target_endian = "big", target_pointer_width = "32"))] {
1433        // Aligned ALU word, big-endian, 32-bit
1434
        /// Safety invariant: this is the number of bytes consumed by
1436        /// unpack_alu. This will be twice the pointer width, as it consumes two usizes.
1437        /// This is also the number of bytes produced by pack_alu.
1438        /// This is also the number of u16 code units produced/consumed by unpack_alu/pack_alu respectively.
1439        pub const ALU_STRIDE_SIZE: usize = 8;
1440
1441        pub const MAX_STRIDE_SIZE: usize = 8;
1442
1443        // Safety invariant: this is the pointer width in bytes
1444        pub const ALU_ALIGNMENT: usize = 4;
1445
1446        // Safety invariant: this is a mask for getting the bits of a pointer not aligned to ALU_ALIGNMENT
1447        pub const ALU_ALIGNMENT_MASK: usize = 3;
1448
1449        /// Safety: dst must point to valid space for writing four `usize`s
1450        #[inline(always)]
1451        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
1452            let first = ((0xFF00_0000usize & word) >> 8) |
1453                         ((0x00FF_0000usize & word) >> 16);
1454            let second = ((0x0000_FF00usize & word) << 8) |
1455                        (0x0000_00FFusize & word);
1456            let third = ((0xFF00_0000usize & second_word) >> 8) |
1457                         ((0x00FF_0000usize & second_word) >> 16);
1458            let fourth = ((0x0000_FF00usize & second_word) << 8) |
1459                        (0x0000_00FFusize & second_word);
1460            // Safety: fn invariant used here
1461            *dst = first;
1462            *(dst.add(1)) = second;
1463            *(dst.add(2)) = third;
1464            *(dst.add(3)) = fourth;
1465        }
1466
1467        /// Safety: dst must point to valid space for writing two `usize`s
1468        #[inline(always)]
1469        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
1470            let word = ((0x00FF_0000usize & first) << 8) |
1471                       ((0x0000_00FFusize & first) << 16) |
1472                       ((0x00FF_0000usize & second) >> 8) |
1473                       (0x0000_00FFusize & second);
1474            let second_word = ((0x00FF_0000usize & third) << 8) |
1475                              ((0x0000_00FFusize & third) << 16) |
1476                              ((0x00FF_0000usize & fourth) >> 8) |
1477                              (0x0000_00FFusize & fourth);
1478            // Safety: fn invariant used here
1479            *dst = word;
1480            *(dst.add(1)) = second_word;
1481        }
1482    } else {
1483        ascii_naive!(ascii_to_ascii, u8, u8);
1484        ascii_naive!(ascii_to_basic_latin, u8, u16);
1485        ascii_naive!(basic_latin_to_ascii, u16, u8);
1486    }
1487}
1488
1489cfg_if! {
1490    // Safety-usable invariant: this counts the zeroes from the "first byte" of utf-8 data packed into a usize
1491    // with the target endianness
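    // (Illustrative: on little endian the first text-order byte occupies the
    // least significant bits of the usize, so trailing_zeros counts the bits
    // before the first set mask bit; on big endian the first text-order byte
    // occupies the most significant bits, hence leading_zeros.)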
1492    if #[cfg(target_endian = "little")] {
1493        #[allow(dead_code)]
1494        #[inline(always)]
1495        fn count_zeros(word: usize) -> u32 {
1496            word.trailing_zeros()
1497        }
1498    } else {
1499        #[allow(dead_code)]
1500        #[inline(always)]
1501        fn count_zeros(word: usize) -> u32 {
1502            word.leading_zeros()
1503        }
1504    }
1505}
1506
1507cfg_if! {
1508    if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "disabled"))] {
1509        /// Safety-usable invariant: Will return the value and position of the first non-ASCII byte in the slice in a Some if found.
1510        /// In other words, the first element of the Some is always `> 127`
1511        #[inline(always)]
1512        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1513            let src = slice.as_ptr();
1514            let len = slice.len();
1515            let mut offset = 0usize;
            // Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE` elements.
1517            if SIMD_STRIDE_SIZE <= len {
1518                let len_minus_stride = len - SIMD_STRIDE_SIZE;
1519                loop {
1520                    // Safety: src at offset is valid for a `SIMD_STRIDE_SIZE` read
1521                    let simd = unsafe { load16_unaligned(src.add(offset)) };
1522                    if !simd_is_ascii(simd) {
1523                        break;
1524                    }
1525                    offset += SIMD_STRIDE_SIZE;
1526                    // This is `offset > len - SIMD_STRIDE_SIZE` which means we always have at least `SIMD_STRIDE_SIZE` elements to munch next time.
1527                    if offset > len_minus_stride {
1528                        break;
1529                    }
1530                }
1531            }
1532            while offset < len {
1533                let code_unit = slice[offset];
1534                if code_unit > 127 {
1535                    // Safety: Safety-usable invariant upheld here
1536                    return Some((code_unit, offset));
1537                }
1538                offset += 1;
1539            }
1540            None
1541        }
1542    } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
1543        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
1544        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
1545        #[inline(always)]
1546        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1547            let src = slice.as_ptr();
1548            let len = slice.len();
1549            let mut offset = 0usize;
1550            // Safety: if this check succeeds we're valid for reading at least `stride` elements.
1551            if SIMD_STRIDE_SIZE <= len {
1552                // First, process one unaligned vector
1553                // Safety: src is valid for a `SIMD_STRIDE_SIZE` read
1554                let simd = unsafe { load16_unaligned(src) };
1555                let mask = mask_ascii(simd);
1556                if mask != 0 {
1557                    offset = mask.trailing_zeros() as usize;
1558                    let non_ascii = unsafe { *src.add(offset) };
1559                    return Some((non_ascii, offset));
1560                }
1561                offset = SIMD_STRIDE_SIZE;
1562                // Safety: Now that offset has changed we don't yet know how much it is valid for
1563
1564                // We have now seen 16 ASCII bytes. Let's guess that
1565                // there will be enough more to justify more expense
1566                // in the case of non-ASCII.
                // Use aligned reads for the sake of old microarchitectures.
1568                // Safety: this correctly calculates the number of src_units that need to be read before the remaining list is aligned.
1569                // This is by definition less than SIMD_ALIGNMENT, which is defined to be equal to SIMD_STRIDE_SIZE.
1570                let until_alignment = unsafe { (SIMD_ALIGNMENT - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK };
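                // (Illustrative arithmetic: if `src.add(offset)` ends in 0x...5,
                // then (16 - 5) & 15 == 11 bytes remain before the next 16-byte
                // boundary; an already-aligned address gives (16 - 0) & 15 == 0.)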
1571                // This addition won't overflow, because even in the 32-bit PAE case the
1572                // address space holds enough code that the slice length can't be that
1573                // close to address space size.
1574                // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
1575                //
1576                // Safety: if this check succeeds we're valid for reading at least `2 * SIMD_STRIDE_SIZE` elements plus `until_alignment`.
1577                // The extra SIMD_STRIDE_SIZE in the condition is because `offset` is already `SIMD_STRIDE_SIZE`.
1578                if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
1579                    if until_alignment != 0 {
1580                        // Safety: this is safe to call since we're valid for this read (and more), and don't care about alignment
                        // This reads some bytes that will be checked again after the alignment adjustment, since `offset` advances by `until_alignment` rather than SIMD_STRIDE_SIZE. This is fine.
1582                        let simd = unsafe { load16_unaligned(src.add(offset)) };
1583                        let mask = mask_ascii(simd);
1584                        if mask != 0 {
1585                            offset += mask.trailing_zeros() as usize;
1586                            let non_ascii = unsafe { *src.add(offset) };
1587                            return Some((non_ascii, offset));
1588                        }
1589                        offset += until_alignment;
1590                    }
1591                    // Safety: At this point we're valid for reading 2*SIMD_STRIDE_SIZE elements
                    // Safety: Now `src.add(offset)` is SIMD-aligned
1593                    let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
1594                    loop {
1595                        // Safety: We were valid for this read, and were aligned.
1596                        let first = unsafe { load16_aligned(src.add(offset)) };
1597                        let second = unsafe { load16_aligned(src.add(offset + SIMD_STRIDE_SIZE)) };
1598                        if !simd_is_ascii(first | second) {
1599                            // Safety: mask_ascii produces a mask of all the high bits.
1600                            let mask_first = mask_ascii(first);
1601                            if mask_first != 0 {
1602                                // Safety: on little endian systems this will be the number of ascii bytes
1603                                // before the first non-ascii, i.e. valid for indexing src
1604                                // TODO SAFETY: What about big-endian systems?
1605                                offset += mask_first.trailing_zeros() as usize;
1606                            } else {
1607                                let mask_second = mask_ascii(second);
1608                                // Safety: on little endian systems this will be the number of ascii bytes
1609                                // before the first non-ascii, i.e. valid for indexing src
1610                                offset += SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize;
1611                            }
1612                            // Safety: We know this is non-ASCII, and can uphold the safety-usable invariant here
1613                            let non_ascii = unsafe { *src.add(offset) };
1614
1615                            return Some((non_ascii, offset));
1616                        }
1617                        offset += SIMD_STRIDE_SIZE * 2;
1618                        // Safety: This is `offset > len - 2 * SIMD_STRIDE_SIZE` which means we always have at least `2 * SIMD_STRIDE_SIZE` elements to munch next time.
1619                        if offset > len_minus_stride_times_two {
1620                            break;
1621                        }
1622                    }
1623                    // Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE`
1624                    if offset + SIMD_STRIDE_SIZE <= len {
1625                        // Safety: We were valid for this read, and were aligned.
1626                        let simd = unsafe { load16_aligned(src.add(offset)) };
1627                        // Safety: mask_ascii produces a mask of all the high bits.
1628                        let mask = mask_ascii(simd);
1629                        if mask != 0 {
1630                            // Safety: on little endian systems this will be the number of ascii bytes
1631                            // before the first non-ascii, i.e. valid for indexing src
1632                            offset += mask.trailing_zeros() as usize;
1633                            let non_ascii = unsafe { *src.add(offset) };
1634                            // Safety: We know this is non-ASCII, and can uphold the safety-usable invariant here
1635                            return Some((non_ascii, offset));
1636                        }
1637                        offset += SIMD_STRIDE_SIZE;
1638                    }
1639                } else {
1640                    // Safety: this is the unaligned branch
1641                    // At most two iterations, so unroll
1642                    // Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE`
1643                    if offset + SIMD_STRIDE_SIZE <= len {
1644                        // Safety: We're valid for this read but must use an unaligned read
1645                        let simd = unsafe { load16_unaligned(src.add(offset)) };
1646                        let mask = mask_ascii(simd);
1647                        if mask != 0 {
1648                            offset += mask.trailing_zeros() as usize;
1649                            let non_ascii = unsafe { *src.add(offset) };
1650                            // Safety-usable invariant upheld here (same as above)
1651                            return Some((non_ascii, offset));
1652                        }
1653                        offset += SIMD_STRIDE_SIZE;
1654                        // Safety: if this check succeeds we're valid for reading at least `SIMD_STRIDE_SIZE`
1655                        if offset + SIMD_STRIDE_SIZE <= len {
1656                            // Safety: We're valid for this read but must use an unaligned read
                            let simd = unsafe { load16_unaligned(src.add(offset)) };
                            let mask = mask_ascii(simd);
1659                            if mask != 0 {
1660                                offset += mask.trailing_zeros() as usize;
1661                                let non_ascii = unsafe { *src.add(offset) };
1662                                // Safety-usable invariant upheld here (same as above)
1663                                return Some((non_ascii, offset));
1664                            }
1665                            offset += SIMD_STRIDE_SIZE;
1666                        }
1667                    }
1668                }
1669            }
1670            while offset < len {
1671                // Safety: relies straightforwardly on the `len` invariant
1672                let code_unit = unsafe { *(src.add(offset)) };
1673                if code_unit > 127 {
1674                    // Safety-usable invariant upheld here
1675                    return Some((code_unit, offset));
1676                }
1677                offset += 1;
1678            }
1679            None
1680        }
1681    } else {
1682        // Safety-usable invariant: returns byte index of first non-ascii byte
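        // Worked example (illustrative, little-endian 64-bit): for the bytes
        // b"abc\xE9defg", `word` is 0x6766_6564_E963_6261, so `word & ASCII_MASK`
        // is 0x0000_0000_8000_0000; count_zeros returns 31 and 31 >> 3 == 3,
        // the index of the 0xE9 byte.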
1683        #[inline(always)]
1684        fn find_non_ascii(word: usize, second_word: usize) -> Option<usize> {
1685            let word_masked = word & ASCII_MASK;
1686            let second_masked = second_word & ASCII_MASK;
1687            if (word_masked | second_masked) == 0 {
1688                // Both are ascii, invariant upheld
1689                return None;
1690            }
1691            if word_masked != 0 {
1692                let zeros = count_zeros(word_masked);
                // `zeros` is 8 times the number of ASCII bytes, in text order,
                // before the first non-ASCII byte, plus an endian-dependent
                // remainder: 7 on little endian (the surviving mask bit is the
                // high bit of its byte) and 0 on big endian. Shifting right by
                // 3 therefore yields the byte index on either endianness.
1698                let num_ascii = (zeros >> 3) as usize;
1699                // Safety-usable invariant upheld here
1700                return Some(num_ascii);
1701            }
1702            let zeros = count_zeros(second_masked);
            // Same reasoning as above: `zeros` is 8 times the number of ASCII
            // bytes before the first non-ASCII byte plus an endian-dependent
            // remainder (7 on little endian, 0 on big endian).
1708            let num_ascii = (zeros >> 3) as usize;
1709            // Safety-usable invariant upheld here
1710            Some(ALU_ALIGNMENT + num_ascii)
1711        }
1712
1713        /// Safety: `src` must be valid for the reads of two `usize`s
1714        ///
1715        /// Safety-usable invariant: will return byte index of first non-ascii byte
1716        #[inline(always)]
1717        unsafe fn validate_ascii_stride(src: *const usize) -> Option<usize> {
1718            let word = *src;
1719            let second_word = *(src.add(1));
1720            find_non_ascii(word, second_word)
1721        }
1722
1723        /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being
1724        /// guaranteed to be non-ASCII (> 127), and the second being the offset where it is found
1725        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
1726        #[inline(always)]
1727        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1728            let src = slice.as_ptr();
1729            let len = slice.len();
1730            let mut offset = 0usize;
1731            let mut until_alignment = (ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & ALU_ALIGNMENT_MASK;
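            // (Illustrative: on a 64-bit target an address ending in 3 gives
            // (8 - 3) & 7 == 5 lead-in bytes; an already-aligned address gives
            // (8 - 0) & 7 == 0.)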
            // Safety: If this check succeeds we're valid to read `until_alignment + ALU_STRIDE_SIZE` elements
1733            if until_alignment + ALU_STRIDE_SIZE <= len {
1734                while until_alignment != 0 {
1735                    let code_unit = slice[offset];
1736                    if code_unit > 127 {
                        // Safety-usable invariant upheld here
1738                        return Some((code_unit, offset));
1739                    }
1740                    offset += 1;
1741                    until_alignment -= 1;
1742                }
1743                // Safety: At this point we have read until_alignment elements and
1744                // are valid for `ALU_STRIDE_SIZE` more.
1745                let len_minus_stride = len - ALU_STRIDE_SIZE;
1746                loop {
1747                    // Safety: we were valid for this read
1748                    let ptr = unsafe { src.add(offset) as *const usize };
1749                    if let Some(num_ascii) = unsafe { validate_ascii_stride(ptr) } {
1750                        offset += num_ascii;
                        // Safety-usable invariant upheld here using the invariant from validate_ascii_stride()
1752                        return Some((unsafe { *(src.add(offset)) }, offset));
1753                    }
1754                    offset += ALU_STRIDE_SIZE;
                    // Safety: This is `offset > len - ALU_STRIDE_SIZE` which means we always have at least `ALU_STRIDE_SIZE` elements to munch next time.
1756                    if offset > len_minus_stride {
1757                        break;
1758                    }
1759                }
1760            }
1761            while offset < len {
1762                let code_unit = slice[offset];
1763                if code_unit > 127 {
                    // Safety-usable invariant upheld here
1765                    return Some((code_unit, offset));
1766                }
1767                offset += 1;
1768           }
1769           None
1770        }
1771
1772    }
1773}
1774
1775cfg_if! {
1776    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] {
1777
1778    } else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
1779        // Even with NEON enabled, we use the ALU path for ASCII validation, because testing
1780        // on Exynos 5 indicated that using NEON isn't worthwhile where there are only
1781        // vector reads without vector writes.
1782
1783        pub const ALU_STRIDE_SIZE: usize = 8;
1784
1785        pub const ALU_ALIGNMENT: usize = 4;
1786
1787        pub const ALU_ALIGNMENT_MASK: usize = 3;
1788    } else {
1789        // Safety: src points to two valid `usize`s, dst points to four valid `usize`s
1790        #[inline(always)]
1791        unsafe fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
1792            // Safety: src safety invariant used here
1793            let word = *src;
1794            let second_word = *(src.add(1));
1795            // Safety: dst safety invariant passed down
1796            unpack_alu(word, second_word, dst);
1797        }
1798
1799        // Safety: src points to four valid `usize`s, dst points to two valid `usize`s
1800        #[inline(always)]
1801        unsafe fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
1802            // Safety: src safety invariant used here
1803            let first = *src;
1804            let second = *(src.add(1));
1805            let third = *(src.add(2));
1806            let fourth = *(src.add(3));
1807            // Safety: dst safety invariant passed down
1808            pack_alu(first, second, third, fourth, dst);
1809        }
1810
1811        // Safety: src points to two valid `usize`s, dst points to four valid `usize`s
1812        #[inline(always)]
1813        unsafe fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool {
1814            // Safety: src safety invariant used here
1815            let word = *src;
1816            let second_word = *(src.add(1));
            // Check if the words contain non-ASCII
1818            if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 {
1819                return false;
1820            }
1821            // Safety: dst safety invariant passed down
1822            unpack_alu(word, second_word, dst);
1823            true
1824        }
1825
        // Safety: src points to four valid `usize`s, dst points to two valid `usize`s
1827        #[inline(always)]
1828        unsafe fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool {
1829            // Safety: src safety invariant used here
1830            let first = *src;
1831            let second = *(src.add(1));
1832            let third = *(src.add(2));
1833            let fourth = *(src.add(3));
1834            if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 {
1835                return false;
1836            }
1837            // Safety: dst safety invariant passed down
1838            pack_alu(first, second, third, fourth, dst);
1839            true
1840        }
1841
1842        // Safety: src, dst both point to two valid `usize`s each
1843        // Safety-usable invariant: Will return byte index of first non-ascii byte.
1844        #[inline(always)]
1845        unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option<usize> {
1846            // Safety: src safety invariant used here
1847            let word = *src;
1848            let second_word = *(src.add(1));
            // Safety: dst safety invariant used here
1850            *dst = word;
1851            *(dst.add(1)) = second_word;
1852            // Relies on safety-usable invariant here
1853            find_non_ascii(word, second_word)
1854        }
1855
1856        basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_alu);
1857        basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_alu);
1858        latin1_alu!(unpack_latin1, u8, u16, unpack_latin1_stride_alu);
1859        latin1_alu!(pack_latin1, u16, u8, pack_latin1_stride_alu);
1860        // Safety invariant upheld: ascii_to_ascii_stride will return byte index of first non-ascii if found
1861        ascii_alu!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride);
1862    }
1863}
1864
1865pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
1866    match validate_ascii(bytes) {
1867        None => bytes.len(),
1868        Some((_, num_valid)) => num_valid,
1869    }
1870}
1871
1872pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
1873    for (i, b_ref) in bytes.iter().enumerate() {
1874        let b = *b_ref;
1875        if b >= 0x80 || b == 0x1B || b == 0x0E || b == 0x0F {
1876            return i;
1877        }
1878    }
1879    bytes.len()
1880}
1881
1882// Any copyright to the test code below this comment is dedicated to the
1883// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
1884
1885#[cfg(all(test, feature = "alloc"))]
1886mod tests {
1887    use super::*;
1888    use alloc::vec::Vec;
1889
1890    macro_rules! test_ascii {
1891        ($test_name:ident, $fn_tested:ident, $src_unit:ty, $dst_unit:ty) => {
1892            #[test]
1893            fn $test_name() {
1894                let mut src: Vec<$src_unit> = Vec::with_capacity(32);
1895                let mut dst: Vec<$dst_unit> = Vec::with_capacity(32);
1896                for i in 0..32 {
1897                    src.clear();
1898                    dst.clear();
1899                    dst.resize(32, 0);
1900                    for j in 0..32 {
1901                        let c = if i == j { 0xAA } else { j + 0x40 };
1902                        src.push(c as $src_unit);
1903                    }
1904                    match unsafe { $fn_tested(src.as_ptr(), dst.as_mut_ptr(), 32) } {
1905                        None => unreachable!("Should always find non-ASCII"),
1906                        Some((non_ascii, num_ascii)) => {
1907                            assert_eq!(non_ascii, 0xAA);
1908                            assert_eq!(num_ascii, i);
1909                            for j in 0..i {
1910                                assert_eq!(dst[j], (j + 0x40) as $dst_unit);
1911                            }
1912                        }
1913                    }
1914                }
1915            }
1916        };
1917    }
1918
1919    test_ascii!(test_ascii_to_ascii, ascii_to_ascii, u8, u8);
1920    test_ascii!(test_ascii_to_basic_latin, ascii_to_basic_latin, u8, u16);
1921    test_ascii!(test_basic_latin_to_ascii, basic_latin_to_ascii, u16, u8);
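
    // Sanity checks for the safe helpers above, written against their
    // documented safety-usable invariants.
    #[test]
    fn test_validate_ascii_finds_first_non_ascii() {
        // Long enough to exercise the stride paths on SIMD and ALU builds.
        let mut bytes = [0x41u8; 64];
        bytes[40] = 0xFF;
        assert_eq!(validate_ascii(&bytes), Some((0xFF, 40)));
        assert_eq!(validate_ascii(&bytes[..40]), None);
    }

    #[test]
    fn test_ascii_valid_up_to() {
        assert_eq!(ascii_valid_up_to(b"hello"), 5);
        assert_eq!(ascii_valid_up_to(b"hi\xFFthere"), 2);
    }

    #[test]
    fn test_iso_2022_jp_ascii_valid_up_to() {
        // ESC (0x1B), SO (0x0E) and SI (0x0F) end the ASCII run for ISO-2022-JP.
        assert_eq!(iso_2022_jp_ascii_valid_up_to(b"abc\x1Bdef"), 3);
        assert_eq!(iso_2022_jp_ascii_valid_up_to(b"abc\x0Edef"), 3);
        assert_eq!(iso_2022_jp_ascii_valid_up_to(b"plain ascii"), 11);
    }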
1922}