encoding_rs/single_byte.rs
1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::ascii::*;
12use crate::data::position;
13use crate::handles::*;
14use crate::variant::*;
15
/// Decoder for a single-byte encoding.
///
/// Bytes below 0x80 are treated as ASCII by the decode methods; bytes in
/// 0x80..=0xFF are looked up in `table`.
pub struct SingleByteDecoder {
    /// Code unit for each byte 0x80..=0xFF, indexed by `byte - 0x80`.
    /// A zero entry marks a byte that the decode methods report as malformed.
    table: &'static [u16; 128],
}
19
20impl SingleByteDecoder {
21 pub fn new(data: &'static [u16; 128]) -> VariantDecoder {
22 VariantDecoder::SingleByte(SingleByteDecoder { table: data })
23 }
24
    /// Upper bound on the number of UTF-16 code units needed to decode
    /// `byte_length` input bytes.
    ///
    /// The UTF-16 decode path writes one `u16` per input byte, so the bound
    /// is `byte_length` itself and never overflows.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        Some(byte_length)
    }
28
    /// Upper bound on the number of UTF-8 bytes needed to decode
    /// `byte_length` input bytes when malformed input is not replaced.
    ///
    /// Table entries are 16-bit code units, and a 16-bit code point takes at
    /// most three bytes in UTF-8. Returns `None` if the multiplication
    /// overflows `usize`.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_mul(3)
    }
32
    /// Upper bound on the number of UTF-8 bytes needed to decode
    /// `byte_length` input bytes.
    ///
    /// Uses the same three-bytes-per-input-byte bound as
    /// `max_utf8_buffer_length_without_replacement`. Returns `None` if the
    /// multiplication overflows `usize`.
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_mul(3)
    }
36
    /// Decodes `src` into UTF-8 in `dst` without performing replacement.
    ///
    /// Returns `(result, bytes_read, bytes_written)`:
    ///
    /// * `DecoderResult::InputEmpty` when the source reports no more input.
    /// * `DecoderResult::OutputFull` when the destination reports no room
    ///   for another BMP character.
    /// * `DecoderResult::Malformed(1, 0)` when a byte in 0x80..=0xFF has a
    ///   zero table entry. The read count is whatever `source.consumed()`
    ///   reports at that point (presumably including the offending byte;
    ///   verify against `ByteSource`).
    ///
    /// `_last` is ignored: the decoder keeps no state between calls.
    pub fn decode_to_utf8_raw(
        &mut self,
        src: &[u8],
        dst: &mut [u8],
        _last: bool,
    ) -> (DecoderResult, usize, usize) {
        let mut source = ByteSource::new(src);
        let mut dest = Utf8Destination::new(dst);
        // 'outermost: bulk ASCII copy.
        // 'middle: one non-ASCII byte per iteration.
        // 'innermost: ASCII bytes handled one at a time between non-ASCII bytes.
        'outermost: loop {
            match dest.copy_ascii_from_check_space_bmp(&mut source) {
                CopyAsciiResult::Stop(ret) => return ret,
                CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
                    // Start non-boilerplate
                    //
                    // Since the non-ASCIIness of `non_ascii` is hidden from
                    // the optimizer, it can't figure out that it's OK to
                    // statically omit the bound check when accessing
                    // `[u16; 128]` with an index
                    // `non_ascii as usize - 0x80usize`.
                    //
                    // Safety: `non_ascii` is a u8 byte >=0x80, from the invariants
                    // on Utf8Destination::copy_ascii_from_check_space_bmp()
                    // (and from the `b > 127` check when re-entering from
                    // 'innermost), so the index is in 0..128.
                    let mapped =
                        unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
                    // let mapped = self.table[non_ascii as usize - 0x80usize];
                    if mapped == 0u16 {
                        // A zero entry means the byte has no mapping.
                        return (
                            DecoderResult::Malformed(1, 0),
                            source.consumed(),
                            handle.written(),
                        );
                    }
                    let dest_again = handle.write_bmp_excl_ascii(mapped);
                    // End non-boilerplate
                    match source.check_available() {
                        Space::Full(src_consumed) => {
                            return (
                                DecoderResult::InputEmpty,
                                src_consumed,
                                dest_again.written(),
                            );
                        }
                        Space::Available(source_handle) => {
                            match dest_again.check_space_bmp() {
                                Space::Full(dst_written) => {
                                    return (
                                        DecoderResult::OutputFull,
                                        source_handle.consumed(),
                                        dst_written,
                                    );
                                }
                                Space::Available(mut destination_handle) => {
                                    let (mut b, unread_handle) = source_handle.read();
                                    let source_again = unread_handle.commit();
                                    'innermost: loop {
                                        if b > 127 {
                                            // Non-ASCII again: hand the byte and the
                                            // already space-checked destination handle
                                            // back to 'middle.
                                            non_ascii = b;
                                            handle = destination_handle;
                                            continue 'middle;
                                        }
                                        // Testing on Haswell says that we should write the
                                        // byte unconditionally instead of trying to unread it
                                        // to make it part of the next SIMD stride.
                                        let dest_again_again = destination_handle.write_ascii(b);
                                        if b < 60 {
                                            // We've got punctuation (anything below `<`,
                                            // which is 60 in ASCII): stay in this loop
                                            // instead of returning to the bulk ASCII copy.
                                            match source_again.check_available() {
                                                Space::Full(src_consumed_again) => {
                                                    return (
                                                        DecoderResult::InputEmpty,
                                                        src_consumed_again,
                                                        dest_again_again.written(),
                                                    );
                                                }
                                                Space::Available(source_handle_again) => {
                                                    match dest_again_again.check_space_bmp() {
                                                        Space::Full(dst_written_again) => {
                                                            return (
                                                                DecoderResult::OutputFull,
                                                                source_handle_again.consumed(),
                                                                dst_written_again,
                                                            );
                                                        }
                                                        Space::Available(
                                                            destination_handle_again,
                                                        ) => {
                                                            let (b_again, _unread_handle_again) =
                                                                source_handle_again.read();
                                                            b = b_again;
                                                            destination_handle =
                                                                destination_handle_again;
                                                            continue 'innermost;
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                        // We've got markup or ASCII text
                                        continue 'outermost;
                                    }
                                }
                            }
                        }
                    }
                },
            }
        }
    }
145
    /// Decodes `src` into UTF-16 in `dst` without performing replacement.
    ///
    /// Every input byte produces exactly one `u16`, so only the first
    /// `min(src.len(), dst.len())` positions are processed. Returns
    /// `(result, bytes_read, code_units_written)`:
    ///
    /// * `DecoderResult::InputEmpty` when `dst` is at least as long as `src`
    ///   and everything was converted.
    /// * `DecoderResult::OutputFull` when `dst` is shorter than `src` and
    ///   `dst` was filled.
    /// * `DecoderResult::Malformed(1, 0)` when a byte in 0x80..=0xFF has a
    ///   zero table entry. The read count includes that byte; the written
    ///   count does not.
    ///
    /// `_last` is ignored: the decoder keeps no state between calls.
    pub fn decode_to_utf16_raw(
        &mut self,
        src: &[u8],
        dst: &mut [u16],
        _last: bool,
    ) -> (DecoderResult, usize, usize) {
        // `length` is the number of positions that can be processed; `pending`
        // is the result to report once all of them have been converted.
        let (pending, length) = if dst.len() < src.len() {
            (DecoderResult::OutputFull, dst.len())
        } else {
            (DecoderResult::InputEmpty, src.len())
        };
        // Safety invariant: converted <= length. Quite often we have `converted < length`
        // which will be separately marked.
        let mut converted = 0usize;
        'outermost: loop {
            match unsafe {
                // Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x`
                ascii_to_basic_latin(
                    src.as_ptr().add(converted),
                    dst.as_mut_ptr().add(converted),
                    length - converted,
                )
            } {
                None => {
                    return (pending, length, length);
                }
                Some((mut non_ascii, consumed)) => {
                    // Safety invariant: `converted <= length` upheld, since this can only consume
                    // up to `length - converted` bytes.
                    //
                    // Furthermore, in this context,
                    // we can assume `converted < length` since this branch is only ever hit when
                    // ascii_to_basic_latin fails to consume the entire slice
                    converted += consumed;
                    'middle: loop {
                        // `converted` doesn't count the reading of `non_ascii` yet.
                        // Since the non-ASCIIness of `non_ascii` is hidden from
                        // the optimizer, it can't figure out that it's OK to
                        // statically omit the bound check when accessing
                        // `[u16; 128]` with an index
                        // `non_ascii as usize - 0x80usize`.
                        //
                        // Safety: We can rely on `non_ascii` being between `0x80` and `0xFF` due to
                        // the invariants of `ascii_to_basic_latin()` (or the `b > 127` check below
                        // when re-entering this loop), and our table has enough space for that.
                        let mapped =
                            unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
                        // let mapped = self.table[non_ascii as usize - 0x80usize];
                        if mapped == 0u16 {
                            // A zero entry means the byte has no mapping.
                            return (
                                DecoderResult::Malformed(1, 0),
                                converted + 1, // +1 `for non_ascii`
                                converted,
                            );
                        }
                        unsafe {
                            // Safety: As mentioned above, `converted < length`
                            *(dst.get_unchecked_mut(converted)) = mapped;
                        }
                        // Safety: `converted <= length` upheld, since `converted < length` before this
                        converted += 1;
                        // Next, handle ASCII punctuation and non-ASCII without
                        // going back to ASCII acceleration. Non-ASCII scripts
                        // use ASCII punctuation, so this avoids going to
                        // acceleration just for punctuation/space and then
                        // failing. This is a significant boost to non-ASCII
                        // scripts.
                        // TODO: Split out Latin converters without this part
                        // this stuff makes Latin script-conversion slower.
                        if converted == length {
                            return (pending, length, length);
                        }
                        // Safety: We are back to `converted < length` because of the == above
                        // and can perform this check.
                        let mut b = unsafe { *(src.get_unchecked(converted)) };
                        // Safety: `converted < length` is upheld for this loop
                        'innermost: loop {
                            if b > 127 {
                                non_ascii = b;
                                continue 'middle;
                            }
                            // Testing on Haswell says that we should write the
                            // byte unconditionally instead of trying to unread it
                            // to make it part of the next SIMD stride.
                            unsafe {
                                // Safety: `converted < length` is true for this loop
                                *(dst.get_unchecked_mut(converted)) = u16::from(b);
                            }
                            // Safety: We are now at `converted <= length`. We should *not* `continue`
                            // the loop without reverifying
                            converted += 1;
                            if b < 60 {
                                // We've got punctuation (anything below `<`, which is 60 in ASCII)
                                if converted == length {
                                    return (pending, length, length);
                                }
                                // Safety: we're back to `converted < length` because of the == above
                                b = unsafe { *(src.get_unchecked(converted)) };
                                // Safety: The loop continues as `converted < length`
                                continue 'innermost;
                            }
                            // We've got markup or ASCII text
                            continue 'outermost;
                        }
                    }
                }
            }
        }
    }
254
255 pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize {
256 let mut bytes = buffer;
257 let mut total = 0;
258 loop {
259 if let Some((non_ascii, offset)) = validate_ascii(bytes) {
260 total += offset;
261 // Safety: We can rely on `non_ascii` being between `0x80` and `0xFF` due to
262 // the invariants of `ascii_to_basic_latin()`, and our table has enough space for that.
263 let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
264 if mapped != u16::from(non_ascii) {
265 return total;
266 }
267 total += 1;
268 bytes = &bytes[offset + 1..];
269 } else {
270 return total;
271 }
272 }
273 }
274}
275
/// Encoder for a single-byte encoding.
///
/// Uses the same 128-entry table as the decoder plus the description of one
/// run of consecutive code units that can be encoded by plain offset
/// arithmetic instead of a table search.
pub struct SingleByteEncoder {
    /// Code unit for each byte 0x80..=0xFF, indexed by `byte - 0x80`.
    table: &'static [u16; 128],
    /// First code unit of the consecutive run.
    run_bmp_offset: usize,
    /// Index into `table` (byte value minus 0x80) at which the run starts.
    run_byte_offset: usize,
    /// Number of code units in the run.
    run_length: usize,
}
282
283impl SingleByteEncoder {
284 pub fn new(
285 encoding: &'static Encoding,
286 data: &'static [u16; 128],
287 run_bmp_offset: u16,
288 run_byte_offset: u8,
289 run_length: u8,
290 ) -> Encoder {
291 Encoder::new(
292 encoding,
293 VariantEncoder::SingleByte(SingleByteEncoder {
294 table: data,
295 run_bmp_offset: run_bmp_offset as usize,
296 run_byte_offset: run_byte_offset as usize,
297 run_length: run_length as usize,
298 }),
299 )
300 }
301
    /// Upper bound on the number of output bytes needed to encode
    /// `u16_length` UTF-16 code units without replacement.
    ///
    /// The UTF-16 encode path writes one byte per encoded code unit, so the
    /// bound is `u16_length` itself.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        Some(u16_length)
    }
308
    /// Upper bound on the number of output bytes needed to encode
    /// `byte_length` bytes of UTF-8 input without replacement.
    ///
    /// Returns `byte_length` unchanged: each encoded character occupies one
    /// output byte and at least one input byte.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        Some(byte_length)
    }
315
316 #[inline(always)]
317 fn encode_u16(&self, code_unit: u16) -> Option<u8> {
318 // First, we see if the code unit falls into a run of consecutive
319 // code units that can be mapped by offset. This is very efficient
320 // for most non-Latin encodings as well as Latin1-ish encodings.
321 //
322 // For encodings that don't fit this pattern, the run (which may
323 // have the length of just one) just establishes the starting point
324 // for the next rule.
325 //
326 // Next, we do a forward linear search in the part of the index
327 // after the run. Even in non-Latin1-ish Latin encodings (except
328 // macintosh), the lower case letters are here.
329 //
330 // Next, we search the third quadrant up to the start of the run
331 // (upper case letters in Latin encodings except macintosh, in
332 // Greek and in KOI encodings) and then the second quadrant,
333 // except if the run stared before the third quadrant, we search
334 // the second quadrant up to the run.
335 //
336 // Last, we search the first quadrant, which has unused controls
337 // or punctuation in most encodings. This is bad for macintosh
338 // and IBM866, but those are rare.
339
340 // Run of consecutive units
341 let unit_as_usize = code_unit as usize;
342 let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
343 if offset < self.run_length {
344 return Some((128 + self.run_byte_offset + offset) as u8);
345 }
346
347 // Search after the run
348 let tail_start = self.run_byte_offset + self.run_length;
349 if let Some(pos) = position(&self.table[tail_start..], code_unit) {
350 return Some((128 + tail_start + pos) as u8);
351 }
352
353 if self.run_byte_offset >= 64 {
354 // Search third quadrant before the run
355 if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
356 return Some(((128 + 64) + pos) as u8);
357 }
358
359 // Search second quadrant
360 if let Some(pos) = position(&self.table[32..64], code_unit) {
361 return Some(((128 + 32) + pos) as u8);
362 }
363 } else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
364 // windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
365 // Search second quadrant before the run
366 return Some(((128 + 32) + pos) as u8);
367 }
368
369 // Search first quadrant
370 if let Some(pos) = position(&self.table[..32], code_unit) {
371 return Some((128 + pos) as u8);
372 }
373
374 None
375 }
376
    // Macro-generated encode method. Judging from the arguments, this
    // presumably expands to `encode_from_utf8_raw`, taking a `str` and
    // reading it through `Utf8Source` (NOTE(review): confirm against the
    // macro definition, which is not in this file).
    //
    // The block below is the per-character body for a non-ASCII BMP code
    // unit bound to `bmp`. It writes the byte from `encode_u16`, or returns
    // an unmappable result together with the counts reported by `source`
    // and `handle`.
    ascii_compatible_bmp_encoder_function!(
        {
            match self.encode_u16(bmp) {
                Some(byte) => handle.write_one(byte),
                None => {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_one,
        check_space_one,
        encode_from_utf8_raw,
        str,
        Utf8Source,
        true
    );
401
402 pub fn encode_from_utf16_raw(
403 &mut self,
404 src: &[u16],
405 dst: &mut [u8],
406 _last: bool,
407 ) -> (EncoderResult, usize, usize) {
408 let (pending, length) = if dst.len() < src.len() {
409 (EncoderResult::OutputFull, dst.len())
410 } else {
411 (EncoderResult::InputEmpty, src.len())
412 };
413 // Safety invariant: converted <= length. Quite often we have `converted < length`
414 // which will be separately marked.
415 let mut converted = 0usize;
416 'outermost: loop {
417 match unsafe {
418 // Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x`
419 basic_latin_to_ascii(
420 src.as_ptr().add(converted),
421 dst.as_mut_ptr().add(converted),
422 length - converted,
423 )
424 } {
425 None => {
426 return (pending, length, length);
427 }
428 Some((mut non_ascii, consumed)) => {
429 // Safety invariant: `converted <= length` upheld, since this can only consume
430 // up to `length - converted` bytes.
431 //
432 // Furthermore, in this context,
433 // we can assume `converted < length` since this branch is only ever hit when
434 // ascii_to_basic_latin fails to consume the entire slice
435 converted += consumed;
436 'middle: loop {
437 // `converted` doesn't count the reading of `non_ascii` yet.
438 match self.encode_u16(non_ascii) {
439 Some(byte) => {
440 unsafe {
441 // Safety: we're allowed this access since `converted < length`
442 *(dst.get_unchecked_mut(converted)) = byte;
443 }
444 converted += 1;
445 // `converted <= length` now
446 }
447 None => {
448 // At this point, we need to know if we
449 // have a surrogate.
450 let high_bits = non_ascii & 0xFC00u16;
451 if high_bits == 0xD800u16 {
452 // high surrogate
453 if converted + 1 == length {
454 // End of buffer. This surrogate is unpaired.
455 return (
456 EncoderResult::Unmappable('\u{FFFD}'),
457 converted + 1, // +1 `for non_ascii`
458 converted,
459 );
460 }
461 // Safety: convered < length from outside the match, and `converted + 1 != length`,
462 // So `converted + 1 < length` as well. We're in bounds
463 let second =
464 u32::from(unsafe { *src.get_unchecked(converted + 1) });
465 if second & 0xFC00u32 != 0xDC00u32 {
466 return (
467 EncoderResult::Unmappable('\u{FFFD}'),
468 converted + 1, // +1 `for non_ascii`
469 converted,
470 );
471 }
472 // The next code unit is a low surrogate.
473 let astral: char = unsafe {
474 // Safety: We can rely on non_ascii being 0xD800-0xDBFF since the high bits are 0xD800
475 // Then, (non_ascii << 10 - 0xD800 << 10) becomes between (0 to 0x3FF) << 10, which is between
476 // 0x400 to 0xffc00. Adding the 0x10000 gives a range of 0x10400 to 0x10fc00. Subtracting the 0xDC00
477 // gives 0x2800 to 0x102000
478 // The second term is between 0xDC00 and 0xDFFF from the check above. This gives a maximum
479 // possible range of (0x10400 + 0xDC00) to (0x102000 + 0xDFFF) which is 0x1E000 to 0x10ffff.
480 // This is in range.
481 //
482 // From a Unicode principles perspective this can also be verified as we have checked that `non_ascii` is a high surrogate
483 // (0xD800..=0xDBFF), and that `second` is a low surrogate (`0xDC00..=0xDFFF`), and we are applying reverse of the UTC16 transformation
484 // algorithm <https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF>, by applying the high surrogate - 0xD800 to the
485 // high ten bits, and the low surrogate - 0xDc00 to the low ten bits, and then adding 0x10000
486 ::core::char::from_u32_unchecked(
487 (u32::from(non_ascii) << 10) + second
488 - (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
489 )
490 };
491 return (
492 EncoderResult::Unmappable(astral),
493 converted + 2, // +2 `for non_ascii` and `second`
494 converted,
495 );
496 }
497 if high_bits == 0xDC00u16 {
498 // Unpaired low surrogate
499 return (
500 EncoderResult::Unmappable('\u{FFFD}'),
501 converted + 1, // +1 `for non_ascii`
502 converted,
503 );
504 }
505 return (
506 EncoderResult::unmappable_from_bmp(non_ascii),
507 converted + 1, // +1 `for non_ascii`
508 converted,
509 );
510 // Safety: This branch diverges, so no need to uphold invariants on `converted`
511 }
512 }
513 // Next, handle ASCII punctuation and non-ASCII without
514 // going back to ASCII acceleration. Non-ASCII scripts
515 // use ASCII punctuation, so this avoid going to
516 // acceleration just for punctuation/space and then
517 // failing. This is a significant boost to non-ASCII
518 // scripts.
519 // TODO: Split out Latin converters without this part
520 // this stuff makes Latin script-conversion slower.
521 if converted == length {
522 return (pending, length, length);
523 }
524 // Safety: we're back to `converted < length` due to the == above and can perform
525 // the unchecked read
526 let mut unit = unsafe { *(src.get_unchecked(converted)) };
527 'innermost: loop {
528 // Safety: This loop always begins with `converted < length`, see
529 // the invariant outside and the comment on the continue below
530 if unit > 127 {
531 non_ascii = unit;
532 continue 'middle;
533 }
534 // Testing on Haswell says that we should write the
535 // byte unconditionally instead of trying to unread it
536 // to make it part of the next SIMD stride.
537 unsafe {
538 // Safety: Can rely on converted < length
539 *(dst.get_unchecked_mut(converted)) = unit as u8;
540 }
541 converted += 1;
542 // `converted <= length` here
543 if unit < 60 {
544 // We've got punctuation
545 if converted == length {
546 return (pending, length, length);
547 }
548 // Safety: `converted < length` due to the == above. The read is safe.
549 unit = unsafe { *(src.get_unchecked(converted)) };
550 // Safety: This only happens if `converted < length`, maintaining it
551 continue 'innermost;
552 }
553 // We've got markup or ASCII text
554 continue 'outermost;
555 // Safety: All other routes to here diverge so the continue is the only
556 // way to run the innermost loop.
557 }
558 }
559 }
560 }
561 }
562 }
563}
564
565// Any copyright to the test code below this comment is dedicated to the
566// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
567
568#[cfg(all(test, feature = "alloc"))]
569mod tests {
570 use super::super::testing::*;
571 use super::super::*;
572
    // Byte 0xCA in windows-1255 must map to U+05BA in both directions.
    #[test]
    fn test_windows_1255_ca() {
        decode(WINDOWS_1255, b"\xCA", "\u{05BA}");
        encode(WINDOWS_1255, "\u{05BA}", b"\xCA");
    }
578
    // Round-trips Greek text interleaved with ASCII spaces and full stops
    // through windows-1253. Presumably this targets the code path that
    // handles ASCII punctuation between non-ASCII characters.
    #[test]
    fn test_ascii_punctuation() {
        let bytes = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
        let characters = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
                          \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                          \u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
                          \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                          \u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
        decode(WINDOWS_1253, bytes, characters);
        encode(WINDOWS_1253, characters, bytes);
    }
590
    // Byte 0xD2 is expected to decode to U+FFFD in windows-1253 while the
    // surrounding bytes decode normally.
    #[test]
    fn test_decode_malformed() {
        decode(
            WINDOWS_1253,
            b"\xC1\xF5\xD2\xF4\xFC",
            "\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}",
        );
    }
599
600 #[test]
601 fn test_encode_unmappables() {
602 encode(
603 WINDOWS_1253,
604 "\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}",
605 b"\xC1\xF5☃\xF4\xFC",
606 );
607 encode(
608 WINDOWS_1253,
609 "\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}",
610 b"\xC1\xF5💩\xF4\xFC",
611 );
612 }
613
614 #[test]
615 fn test_encode_unpaired_surrogates() {
616 encode_from_utf16(
617 WINDOWS_1253,
618 &[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16],
619 b"\xC1\xF5�\xF4\xFC",
620 );
621 encode_from_utf16(
622 WINDOWS_1253,
623 &[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16],
624 b"\xC1\xF5�\xF4\xFC",
625 );
626 encode_from_utf16(
627 WINDOWS_1253,
628 &[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16],
629 b"\xC1\xF5\xF4\xFC�",
630 );
631 }
632
633 pub const HIGH_BYTES: &'static [u8; 128] = &[
634 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E,
635 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D,
636 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC,
637 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB,
638 0xBC, 0xBD, 0xBE, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA,
639 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9,
640 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8,
641 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
642 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
643 ];
644
645 fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
646 let mut with_replacement = [0u16; 128];
647 let mut it = data.iter().enumerate();
648 loop {
649 match it.next() {
650 Some((i, code_point)) => {
651 if *code_point == 0 {
652 with_replacement[i] = 0xFFFD;
653 } else {
654 with_replacement[i] = *code_point;
655 }
656 }
657 None => {
658 break;
659 }
660 }
661 }
662
663 decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
664 }
665
666 fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
667 let mut with_zeros = [0u8; 128];
668 let mut it = data.iter().enumerate();
669 loop {
670 match it.next() {
671 Some((i, code_point)) => {
672 if *code_point == 0 {
673 with_zeros[i] = 0;
674 } else {
675 with_zeros[i] = HIGH_BYTES[i];
676 }
677 }
678 None => {
679 break;
680 }
681 }
682 }
683
684 encode_from_utf16(encoding, data, &with_zeros[..]);
685 }
686
687 #[test]
688 fn test_single_byte_from_two_low_surrogates() {
689 let expectation = b"��";
690 let mut output = [0u8; 40];
691 let mut encoder = WINDOWS_1253.new_encoder();
692 let (result, read, written, had_errors) =
693 encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
694 assert_eq!(result, CoderResult::InputEmpty);
695 assert_eq!(read, 2);
696 assert_eq!(written, expectation.len());
697 assert!(had_errors);
698 assert_eq!(&output[..written], expectation);
699 }
700
701 // These tests are so self-referential that they are pretty useless.
702
703 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
704 // Instead, please regenerate using generate-encoding-data.py
705
    #[test]
    fn test_single_byte_decode() {
        decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
        decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
        if cfg!(miri) {
            // Miri is too slow to run every table, so under Miri only the
            // two encodings above are checked.
            return;
        }
        decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
        decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
        decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
        decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
        decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
        decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
        decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
        decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
        decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
        decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
        decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
        decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
        decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
        decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
        decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
        decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
        decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
        decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
        decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
        decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
        decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
        decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
        decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
        decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
        decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
    }
740
    #[test]
    fn test_single_byte_encode() {
        encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
        encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
        if cfg!(miri) {
            // Miri is too slow to run every table, so under Miri only the
            // two encodings above are checked.
            return;
        }
        encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
        encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
        encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
        encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
        encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
        encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
        encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
        encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
        encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
        encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
        encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
        encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
        encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
        encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
        encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
        encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
        encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
        encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
        encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
        encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
        encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
        encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
        encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
        encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
        encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
    }
775 // END GENERATED CODE
776}