stable_pattern/pattern.rs
1//! The string Pattern API.
2//!
3//! The Pattern API provides a generic mechanism for using different pattern
4//! types when searching through a string.
5//!
6//! For more details, see the traits [`Pattern`], [`Searcher`],
7//! [`ReverseSearcher`], and [`DoubleEndedSearcher`].
8//!
9//! Although this API is unstable, it is exposed via stable APIs on the
10//! [`str`] type.
11//!
12//! # Examples
13//!
14//! [`Pattern`] is [implemented][pattern-impls] in the stable API for
15//! [`&str`][`str`], [`char`], slices of [`char`], and functions and closures
16//! implementing `FnMut(char) -> bool`.
17//!
18//! ```
19//! let s = "Can you find a needle in a haystack?";
20//!
21//! // &str pattern
22//! assert_eq!(s.find("you"), Some(4));
23//! // char pattern
24//! assert_eq!(s.find('n'), Some(2));
25//! // slice of chars pattern
26//! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u'][..]), Some(1));
27//! // closure pattern
28//! assert_eq!(s.find(|c: char| c.is_ascii_punctuation()), Some(35));
29//! ```
30//!
31//! [pattern-impls]: Pattern#implementors
32
33use core::cmp;
34use core::fmt;
35
36// Pattern
37
38/// A string pattern.
39///
40/// A `Pattern<'a>` expresses that the implementing type
41/// can be used as a string pattern for searching in a [`&'a str`][str].
42///
43/// For example, both `'a'` and `"aa"` are patterns that
44/// would match at index `1` in the string `"baaaab"`.
45///
46/// The trait itself acts as a builder for an associated
47/// [`Searcher`] type, which does the actual work of finding
48/// occurrences of the pattern in a string.
49///
50/// Depending on the type of the pattern, the behaviour of methods like
51/// [`str::find`] and [`str::contains`] can change. The table below describes
52/// some of those behaviours.
53///
54/// | Pattern type | Match condition |
55/// |--------------------------|-------------------------------------------|
56/// | `&str` | is substring |
57/// | `char` | is contained in string |
58/// | `&[char]` | any char in slice is contained in string |
59/// | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string |
60/// | `&&str` | is substring |
61/// | `&String` | is substring |
62///
63/// # Examples
64///
65/// ```
66/// // &str
67/// assert_eq!("abaaa".find("ba"), Some(1));
68/// assert_eq!("abaaa".find("bac"), None);
69///
70/// // char
71/// assert_eq!("abaaa".find('a'), Some(0));
72/// assert_eq!("abaaa".find('b'), Some(1));
73/// assert_eq!("abaaa".find('c'), None);
74///
75/// // &[char]
76/// assert_eq!("ab".find(&['b', 'a'][..]), Some(0));
77/// assert_eq!("abaaa".find(&['a', 'z'][..]), Some(0));
78/// assert_eq!("abaaa".find(&['c', 'd'][..]), None);
79///
80/// // FnMut(char) -> bool
81/// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4));
82/// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None);
83/// ```
84pub trait Pattern<'a>: Sized {
85 /// Associated searcher for this pattern
86 type Searcher: Searcher<'a>;
87
88 /// Constructs the associated searcher from
89 /// `self` and the `haystack` to search in.
90 fn into_searcher(self, haystack: &'a str) -> Self::Searcher;
91
92 /// Checks whether the pattern matches anywhere in the haystack
93 #[inline]
94 fn is_contained_in(self, haystack: &'a str) -> bool {
95 self.into_searcher(haystack).next_match().is_some()
96 }
97
98 /// Checks whether the pattern matches at the front of the haystack
99 #[inline]
100 fn is_prefix_of(self, haystack: &'a str) -> bool {
101 matches!(self.into_searcher(haystack).next(), SearchStep::Match(0, _))
102 }
103
104 /// Checks whether the pattern matches at the back of the haystack
105 #[inline]
106 fn is_suffix_of(self, haystack: &'a str) -> bool
107 where
108 Self::Searcher: ReverseSearcher<'a>,
109 {
110 matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(_, j) if haystack.len() == j)
111 }
112
113 /// Removes the pattern from the front of haystack, if it matches.
114 #[inline]
115 fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
116 if let SearchStep::Match(start, len) = self.into_searcher(haystack).next() {
117 debug_assert_eq!(
118 start, 0,
119 "The first search step from Searcher \
120 must include the first character"
121 );
122 // SAFETY: `Searcher` is known to return valid indices.
123 unsafe { Some(haystack.get_unchecked(len..)) }
124 } else {
125 None
126 }
127 }
128
129 /// Removes the pattern from the back of haystack, if it matches.
130 #[inline]
131 fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str>
132 where
133 Self::Searcher: ReverseSearcher<'a>,
134 {
135 if let SearchStep::Match(start, end) = self.into_searcher(haystack).next_back() {
136 debug_assert_eq!(
137 end,
138 haystack.len(),
139 "The first search step from ReverseSearcher \
140 must include the last character"
141 );
142 // SAFETY: `Searcher` is known to return valid indices.
143 unsafe { Some(haystack.get_unchecked(..start)) }
144 } else {
145 None
146 }
147 }
148}
149
150// Searcher
151
152/// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`].
153#[derive(Copy, Clone, Eq, PartialEq, Debug)]
154pub enum SearchStep {
155 /// Expresses that a match of the pattern has been found at
156 /// `haystack[a..b]`.
157 Match(usize, usize),
158 /// Expresses that `haystack[a..b]` has been rejected as a possible match
159 /// of the pattern.
160 ///
161 /// Note that there might be more than one `Reject` between two `Match`es,
162 /// there is no requirement for them to be combined into one.
163 Reject(usize, usize),
164 /// Expresses that every byte of the haystack has been visited, ending
165 /// the iteration.
166 Done,
167}
168
169/// A searcher for a string pattern.
170///
171/// This trait provides methods for searching for non-overlapping
172/// matches of a pattern starting from the front (left) of a string.
173///
174/// It will be implemented by associated `Searcher`
175/// types of the [`Pattern`] trait.
176///
177/// The trait is marked unsafe because the indices returned by the
178/// [`next()`][Searcher::next] methods are required to lie on valid utf8
179/// boundaries in the haystack. This enables consumers of this trait to
180/// slice the haystack without additional runtime checks.
181pub unsafe trait Searcher<'a> {
182 /// Getter for the underlying string to be searched in
183 ///
184 /// Will always return the same [`&str`][str].
185 fn haystack(&self) -> &'a str;
186
187 /// Performs the next search step starting from the front.
188 ///
189 /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` matches
190 /// the pattern.
191 /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` can
192 /// not match the pattern, even partially.
193 /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack has
194 /// been visited.
195 ///
196 /// The stream of [`Match`][SearchStep::Match] and
197 /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done]
198 /// will contain index ranges that are adjacent, non-overlapping,
199 /// covering the whole haystack, and laying on utf8 boundaries.
200 ///
201 /// A [`Match`][SearchStep::Match] result needs to contain the whole matched
202 /// pattern, however [`Reject`][SearchStep::Reject] results may be split up
203 /// into arbitrary many adjacent fragments. Both ranges may have zero length.
204 ///
205 /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"`
206 /// might produce the stream
207 /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]`
208 fn next(&mut self) -> SearchStep;
209
210 /// Finds the next [`Match`][SearchStep::Match] result. See [`next()`][Searcher::next].
211 ///
212 /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges
213 /// of this and [`next_reject`][Searcher::next_reject] will overlap. This will return
214 /// `(start_match, end_match)`, where start_match is the index of where
215 /// the match begins, and end_match is the index after the end of the match.
216 #[inline]
217 fn next_match(&mut self) -> Option<(usize, usize)> {
218 loop {
219 match self.next() {
220 SearchStep::Match(a, b) => return Some((a, b)),
221 SearchStep::Done => return None,
222 _ => continue,
223 }
224 }
225 }
226
227 /// Finds the next [`Reject`][SearchStep::Reject] result. See [`next()`][Searcher::next]
228 /// and [`next_match()`][Searcher::next_match].
229 ///
230 /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges
231 /// of this and [`next_match`][Searcher::next_match] will overlap.
232 #[inline]
233 fn next_reject(&mut self) -> Option<(usize, usize)> {
234 loop {
235 match self.next() {
236 SearchStep::Reject(a, b) => return Some((a, b)),
237 SearchStep::Done => return None,
238 _ => continue,
239 }
240 }
241 }
242}
243
244/// A reverse searcher for a string pattern.
245///
246/// This trait provides methods for searching for non-overlapping
247/// matches of a pattern starting from the back (right) of a string.
248///
249/// It will be implemented by associated [`Searcher`]
250/// types of the [`Pattern`] trait if the pattern supports searching
251/// for it from the back.
252///
253/// The index ranges returned by this trait are not required
254/// to exactly match those of the forward search in reverse.
255///
256/// For the reason why this trait is marked unsafe, see them
257/// parent trait [`Searcher`].
258pub unsafe trait ReverseSearcher<'a>: Searcher<'a> {
259 /// Performs the next search step starting from the back.
260 ///
261 /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]`
262 /// matches the pattern.
263 /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]`
264 /// can not match the pattern, even partially.
265 /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack
266 /// has been visited
267 ///
268 /// The stream of [`Match`][SearchStep::Match] and
269 /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done]
270 /// will contain index ranges that are adjacent, non-overlapping,
271 /// covering the whole haystack, and laying on utf8 boundaries.
272 ///
273 /// A [`Match`][SearchStep::Match] result needs to contain the whole matched
274 /// pattern, however [`Reject`][SearchStep::Reject] results may be split up
275 /// into arbitrary many adjacent fragments. Both ranges may have zero length.
276 ///
277 /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"`
278 /// might produce the stream
279 /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`.
280 fn next_back(&mut self) -> SearchStep;
281
282 /// Finds the next [`Match`][SearchStep::Match] result.
283 /// See [`next_back()`][ReverseSearcher::next_back].
284 #[inline]
285 fn next_match_back(&mut self) -> Option<(usize, usize)> {
286 loop {
287 match self.next_back() {
288 SearchStep::Match(a, b) => return Some((a, b)),
289 SearchStep::Done => return None,
290 _ => continue,
291 }
292 }
293 }
294
295 /// Finds the next [`Reject`][SearchStep::Reject] result.
296 /// See [`next_back()`][ReverseSearcher::next_back].
297 #[inline]
298 fn next_reject_back(&mut self) -> Option<(usize, usize)> {
299 loop {
300 match self.next_back() {
301 SearchStep::Reject(a, b) => return Some((a, b)),
302 SearchStep::Done => return None,
303 _ => continue,
304 }
305 }
306 }
307}
308
309/// A marker trait to express that a [`ReverseSearcher`]
310/// can be used for a [`DoubleEndedIterator`] implementation.
311///
312/// For this, the impl of [`Searcher`] and [`ReverseSearcher`] need
313/// to follow these conditions:
314///
315/// - All results of `next()` need to be identical
316/// to the results of `next_back()` in reverse order.
317/// - `next()` and `next_back()` need to behave as
318/// the two ends of a range of values, that is they
319/// can not "walk past each other".
320///
321/// # Examples
322///
323/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a
324/// [`char`] only requires looking at one at a time, which behaves the same
325/// from both ends.
326///
327/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because
328/// the pattern `"aa"` in the haystack `"aaa"` matches as either
329/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched.
330pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
331
332/////////////////////////////////////////////////////////////////////////////
333// Impl for char
334/////////////////////////////////////////////////////////////////////////////
335
336/// Associated type for `<char as Pattern<'a>>::Searcher`.
337#[derive(Clone, Debug)]
338pub struct CharSearcher<'a> {
339 haystack: &'a str,
340 // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
341 // This invariant can be broken *within* next_match and next_match_back, however
342 // they must exit with fingers on valid code point boundaries.
343 /// `finger` is the current byte index of the forward search.
344 /// Imagine that it exists before the byte at its index, i.e.
345 /// `haystack[finger]` is the first byte of the slice we must inspect during
346 /// forward searching
347 finger: usize,
348 /// `finger_back` is the current byte index of the reverse search.
349 /// Imagine that it exists after the byte at its index, i.e.
350 /// haystack[finger_back - 1] is the last byte of the slice we must inspect during
351 /// forward searching (and thus the first byte to be inspected when calling next_back()).
352 finger_back: usize,
353 /// The character being searched for
354 needle: char,
355
356 // safety invariant: `utf8_size` must be less than 5
357 /// The number of bytes `needle` takes up when encoded in utf8.
358 utf8_size: usize,
359 /// A utf8 encoded copy of the `needle`
360 utf8_encoded: [u8; 4],
361}
362
363unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
364 #[inline]
365 fn haystack(&self) -> &'a str {
366 self.haystack
367 }
368 #[inline]
369 fn next(&mut self) -> SearchStep {
370 let old_finger = self.finger;
371 // SAFETY: 1-4 guarantee safety of `get_unchecked`
372 // 1. `self.finger` and `self.finger_back` are kept on unicode boundaries
373 // (this is invariant)
374 // 2. `self.finger >= 0` since it starts at 0 and only increases
375 // 3. `self.finger < self.finger_back` because otherwise the char `iter`
376 // would return `SearchStep::Done`
377 // 4. `self.finger` comes before the end of the haystack because `self.finger_back`
378 // starts at the end and only decreases
379 let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) };
380 let mut iter = slice.chars();
381 let old_len = iter.as_str().len();
382 if let Some(ch) = iter.next() {
383 // add byte offset of current character
384 // without re-encoding as utf-8
385 self.finger += old_len - iter.as_str().len();
386 if ch == self.needle {
387 SearchStep::Match(old_finger, self.finger)
388 } else {
389 SearchStep::Reject(old_finger, self.finger)
390 }
391 } else {
392 SearchStep::Done
393 }
394 }
395 #[inline]
396 fn next_match(&mut self) -> Option<(usize, usize)> {
397 loop {
398 // get the haystack after the last character found
399 let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?;
400 // the last byte of the utf8 encoded needle
401 // SAFETY: we have an invariant that `utf8_size < 5`
402 let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
403 if let Some(index) = memchr::memchr(last_byte, bytes) {
404 // The new finger is the index of the byte we found,
405 // plus one, since we memchr'd for the last byte of the character.
406 //
407 // Note that this doesn't always give us a finger on a UTF8 boundary.
408 // If we *didn't* find our character
409 // we may have indexed to the non-last byte of a 3-byte or 4-byte character.
410 // We can't just skip to the next valid starting byte because a character like
411 // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
412 // the second byte when searching for the third.
413 //
414 // However, this is totally okay. While we have the invariant that
415 // self.finger is on a UTF8 boundary, this invariant is not relied upon
416 // within this method (it is relied upon in CharSearcher::next()).
417 //
418 // We only exit this method when we reach the end of the string, or if we
419 // find something. When we find something the `finger` will be set
420 // to a UTF8 boundary.
421 self.finger += index + 1;
422 if self.finger >= self.utf8_size {
423 let found_char = self.finger - self.utf8_size;
424 if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
425 if slice == &self.utf8_encoded[0..self.utf8_size] {
426 return Some((found_char, self.finger));
427 }
428 }
429 }
430 } else {
431 // found nothing, exit
432 self.finger = self.finger_back;
433 return None;
434 }
435 }
436 }
437
438 // let next_reject use the default implementation from the Searcher trait
439}
440
441unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
442 #[inline]
443 fn next_back(&mut self) -> SearchStep {
444 let old_finger = self.finger_back;
445 // SAFETY: see the comment for next() above
446 let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) };
447 let mut iter = slice.chars();
448 let old_len = iter.as_str().len();
449 if let Some(ch) = iter.next_back() {
450 // subtract byte offset of current character
451 // without re-encoding as utf-8
452 self.finger_back -= old_len - iter.as_str().len();
453 if ch == self.needle {
454 SearchStep::Match(self.finger_back, old_finger)
455 } else {
456 SearchStep::Reject(self.finger_back, old_finger)
457 }
458 } else {
459 SearchStep::Done
460 }
461 }
462 #[inline]
463 fn next_match_back(&mut self) -> Option<(usize, usize)> {
464 let haystack = self.haystack.as_bytes();
465 loop {
466 // get the haystack up to but not including the last character searched
467 let bytes = haystack.get(self.finger..self.finger_back)?;
468 // the last byte of the utf8 encoded needle
469 // SAFETY: we have an invariant that `utf8_size < 5`
470 let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
471 if let Some(index) = memchr::memrchr(last_byte, bytes) {
472 // we searched a slice that was offset by self.finger,
473 // add self.finger to recoup the original index
474 let index = self.finger + index;
475 // memrchr will return the index of the byte we wish to
476 // find. In case of an ASCII character, this is indeed
477 // were we wish our new finger to be ("after" the found
478 // char in the paradigm of reverse iteration). For
479 // multibyte chars we need to skip down by the number of more
480 // bytes they have than ASCII
481 let shift = self.utf8_size - 1;
482 if index >= shift {
483 let found_char = index - shift;
484 if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
485 if slice == &self.utf8_encoded[0..self.utf8_size] {
486 // move finger to before the character found (i.e., at its start index)
487 self.finger_back = found_char;
488 return Some((self.finger_back, self.finger_back + self.utf8_size));
489 }
490 }
491 }
492 // We can't use finger_back = index - size + 1 here. If we found the last char
493 // of a different-sized character (or the middle byte of a different character)
494 // we need to bump the finger_back down to `index`. This similarly makes
495 // `finger_back` have the potential to no longer be on a boundary,
496 // but this is OK since we only exit this function on a boundary
497 // or when the haystack has been searched completely.
498 //
499 // Unlike next_match this does not
500 // have the problem of repeated bytes in utf-8 because
501 // we're searching for the last byte, and we can only have
502 // found the last byte when searching in reverse.
503 self.finger_back = index;
504 } else {
505 self.finger_back = self.finger;
506 // found nothing, exit
507 return None;
508 }
509 }
510 }
511
512 // let next_reject_back use the default implementation from the Searcher trait
513}
514
515impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}
516
517/// Searches for chars that are equal to a given [`char`].
518///
519/// # Examples
520///
521/// ```
522/// assert_eq!("Hello world".find('o'), Some(4));
523/// ```
524impl<'a> Pattern<'a> for char {
525 type Searcher = CharSearcher<'a>;
526
527 #[inline]
528 fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
529 let mut utf8_encoded = [0; 4];
530 let utf8_size = self.encode_utf8(&mut utf8_encoded).len();
531 CharSearcher {
532 haystack,
533 finger: 0,
534 finger_back: haystack.len(),
535 needle: self,
536 utf8_size,
537 utf8_encoded,
538 }
539 }
540
541 #[inline]
542 fn is_contained_in(self, haystack: &'a str) -> bool {
543 if (self as u32) < 128 {
544 haystack.as_bytes().contains(&(self as u8))
545 } else {
546 let mut buffer = [0u8; 4];
547 self.encode_utf8(&mut buffer).is_contained_in(haystack)
548 }
549 }
550
551 #[inline]
552 fn is_prefix_of(self, haystack: &'a str) -> bool {
553 self.encode_utf8(&mut [0u8; 4]).is_prefix_of(haystack)
554 }
555
556 #[inline]
557 fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
558 self.encode_utf8(&mut [0u8; 4]).strip_prefix_of(haystack)
559 }
560
561 #[inline]
562 fn is_suffix_of(self, haystack: &'a str) -> bool
563 where
564 Self::Searcher: ReverseSearcher<'a>,
565 {
566 self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack)
567 }
568
569 #[inline]
570 fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str>
571 where
572 Self::Searcher: ReverseSearcher<'a>,
573 {
574 self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack)
575 }
576}
577
578/////////////////////////////////////////////////////////////////////////////
579// Impl for a MultiCharEq wrapper
580/////////////////////////////////////////////////////////////////////////////
581
582#[doc(hidden)]
583trait MultiCharEq {
584 fn matches(&mut self, c: char) -> bool;
585}
586
587impl<F> MultiCharEq for F
588where
589 F: FnMut(char) -> bool,
590{
591 #[inline]
592 fn matches(&mut self, c: char) -> bool {
593 (*self)(c)
594 }
595}
596
597impl MultiCharEq for &[char] {
598 #[inline]
599 fn matches(&mut self, c: char) -> bool {
600 self.iter().any(|&m| m == c)
601 }
602}
603
604struct MultiCharEqPattern<C: MultiCharEq>(C);
605
606#[derive(Clone, Debug)]
607struct MultiCharEqSearcher<'a, C: MultiCharEq> {
608 char_eq: C,
609 haystack: &'a str,
610 char_indices: core::str::CharIndices<'a>,
611}
612
613impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> {
614 type Searcher = MultiCharEqSearcher<'a, C>;
615
616 #[inline]
617 fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> {
618 MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices() }
619 }
620}
621
622unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
623 #[inline]
624 fn haystack(&self) -> &'a str {
625 self.haystack
626 }
627
628 #[inline]
629 fn next(&mut self) -> SearchStep {
630 let s = &mut self.char_indices;
631 // Compare lengths of the internal byte slice iterator
632 // to find length of current char
633 let pre_len = s.as_str().len();
634 if let Some((i, c)) = s.next() {
635 let len = s.as_str().len();
636 let char_len = pre_len - len;
637 if self.char_eq.matches(c) {
638 return SearchStep::Match(i, i + char_len);
639 } else {
640 return SearchStep::Reject(i, i + char_len);
641 }
642 }
643 SearchStep::Done
644 }
645}
646
647unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> {
648 #[inline]
649 fn next_back(&mut self) -> SearchStep {
650 let s = &mut self.char_indices;
651 // Compare lengths of the internal byte slice iterator
652 // to find length of current char
653 let pre_len = s.as_str().len();
654 if let Some((i, c)) = s.next_back() {
655 let len = s.as_str().len();
656 let char_len = pre_len - len;
657 if self.char_eq.matches(c) {
658 return SearchStep::Match(i, i + char_len);
659 } else {
660 return SearchStep::Reject(i, i + char_len);
661 }
662 }
663 SearchStep::Done
664 }
665}
666
667impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {}
668
669/////////////////////////////////////////////////////////////////////////////
670
671macro_rules! pattern_methods {
672 ($t:ty, $pmap:expr, $smap:expr) => {
673 type Searcher = $t;
674
675 #[inline]
676 fn into_searcher(self, haystack: &'a str) -> $t {
677 ($smap)(($pmap)(self).into_searcher(haystack))
678 }
679
680 #[inline]
681 fn is_contained_in(self, haystack: &'a str) -> bool {
682 ($pmap)(self).is_contained_in(haystack)
683 }
684
685 #[inline]
686 fn is_prefix_of(self, haystack: &'a str) -> bool {
687 ($pmap)(self).is_prefix_of(haystack)
688 }
689
690 #[inline]
691 fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
692 ($pmap)(self).strip_prefix_of(haystack)
693 }
694
695 #[inline]
696 fn is_suffix_of(self, haystack: &'a str) -> bool
697 where
698 $t: ReverseSearcher<'a>,
699 {
700 ($pmap)(self).is_suffix_of(haystack)
701 }
702
703 #[inline]
704 fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str>
705 where
706 $t: ReverseSearcher<'a>,
707 {
708 ($pmap)(self).strip_suffix_of(haystack)
709 }
710 };
711}
712
713macro_rules! searcher_methods {
714 (forward) => {
715 #[inline]
716 fn haystack(&self) -> &'a str {
717 self.0.haystack()
718 }
719 #[inline]
720 fn next(&mut self) -> SearchStep {
721 self.0.next()
722 }
723 #[inline]
724 fn next_match(&mut self) -> Option<(usize, usize)> {
725 self.0.next_match()
726 }
727 #[inline]
728 fn next_reject(&mut self) -> Option<(usize, usize)> {
729 self.0.next_reject()
730 }
731 };
732 (reverse) => {
733 #[inline]
734 fn next_back(&mut self) -> SearchStep {
735 self.0.next_back()
736 }
737 #[inline]
738 fn next_match_back(&mut self) -> Option<(usize, usize)> {
739 self.0.next_match_back()
740 }
741 #[inline]
742 fn next_reject_back(&mut self) -> Option<(usize, usize)> {
743 self.0.next_reject_back()
744 }
745 };
746}
747
748/////////////////////////////////////////////////////////////////////////////
749// Impl for &[char]
750/////////////////////////////////////////////////////////////////////////////
751
752// Todo: Change / Remove due to ambiguity in meaning.
753
754/// Associated type for `<&[char] as Pattern<'a>>::Searcher`.
755#[derive(Clone, Debug)]
756pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
757
758unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> {
759 searcher_methods!(forward);
760}
761
762unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> {
763 searcher_methods!(reverse);
764}
765
766impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {}
767
768/// Searches for chars that are equal to any of the [`char`]s in the slice.
769///
770/// # Examples
771///
772/// ```
773/// assert_eq!("Hello world".find(&['l', 'l'] as &[_]), Some(2));
774/// assert_eq!("Hello world".find(&['l', 'l'][..]), Some(2));
775/// ```
776impl<'a, 'b> Pattern<'a> for &'b [char] {
777 pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher);
778}
779
780/////////////////////////////////////////////////////////////////////////////
781// Impl for F: FnMut(char) -> bool
782/////////////////////////////////////////////////////////////////////////////
783
784/// Associated type for `<F as Pattern<'a>>::Searcher`.
785#[derive(Clone)]
786pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher)
787where
788 F: FnMut(char) -> bool;
789
790impl<F> fmt::Debug for CharPredicateSearcher<'_, F>
791where
792 F: FnMut(char) -> bool,
793{
794 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
795 f.debug_struct("CharPredicateSearcher")
796 .field("haystack", &self.0.haystack)
797 .field("char_indices", &self.0.char_indices)
798 .finish()
799 }
800}
801unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F>
802where
803 F: FnMut(char) -> bool,
804{
805 searcher_methods!(forward);
806}
807
808unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F>
809where
810 F: FnMut(char) -> bool,
811{
812 searcher_methods!(reverse);
813}
814
815impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool {}
816
817/// Searches for [`char`]s that match the given predicate.
818///
819/// # Examples
820///
821/// ```
822/// assert_eq!("Hello world".find(char::is_uppercase), Some(0));
823/// assert_eq!("Hello world".find(|c| "aeiou".contains(c)), Some(1));
824/// ```
825impl<'a, F> Pattern<'a> for F
826where
827 F: FnMut(char) -> bool,
828{
829 pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher);
830}
831
832/////////////////////////////////////////////////////////////////////////////
833// Impl for &&str
834/////////////////////////////////////////////////////////////////////////////
835
836/// Delegates to the `&str` impl.
837impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str {
838 pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s);
839}
840
841/////////////////////////////////////////////////////////////////////////////
842// Impl for &str
843/////////////////////////////////////////////////////////////////////////////
844
845/// Non-allocating substring search.
846///
847/// Will handle the pattern `""` as returning empty matches at each character
848/// boundary.
849///
850/// # Examples
851///
852/// ```
853/// assert_eq!("Hello world".find("world"), Some(6));
854/// ```
855impl<'a, 'b> Pattern<'a> for &'b str {
856 type Searcher = StrSearcher<'a, 'b>;
857
858 #[inline]
859 fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> {
860 StrSearcher::new(haystack, self)
861 }
862
863 /// Checks whether the pattern matches at the front of the haystack.
864 #[inline]
865 fn is_prefix_of(self, haystack: &'a str) -> bool {
866 haystack.as_bytes().starts_with(self.as_bytes())
867 }
868
869 /// Removes the pattern from the front of haystack, if it matches.
870 #[inline]
871 fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
872 if self.is_prefix_of(haystack) {
873 // SAFETY: prefix was just verified to exist.
874 unsafe { Some(haystack.get_unchecked(self.as_bytes().len()..)) }
875 } else {
876 None
877 }
878 }
879
880 /// Checks whether the pattern matches at the back of the haystack.
881 #[inline]
882 fn is_suffix_of(self, haystack: &'a str) -> bool {
883 haystack.as_bytes().ends_with(self.as_bytes())
884 }
885
886 /// Removes the pattern from the back of haystack, if it matches.
887 #[inline]
888 fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> {
889 if self.is_suffix_of(haystack) {
890 let i = haystack.len() - self.as_bytes().len();
891 // SAFETY: suffix was just verified to exist.
892 unsafe { Some(haystack.get_unchecked(..i)) }
893 } else {
894 None
895 }
896 }
897}
898
899/////////////////////////////////////////////////////////////////////////////
900// Two Way substring searcher
901/////////////////////////////////////////////////////////////////////////////
902
903#[derive(Clone, Debug)]
904/// Associated type for `<&str as Pattern<'a>>::Searcher`.
905pub struct StrSearcher<'a, 'b> {
906 haystack: &'a str,
907 needle: &'b str,
908
909 searcher: StrSearcherImpl,
910}
911
912#[derive(Clone, Debug)]
913enum StrSearcherImpl {
914 Empty(EmptyNeedle),
915 TwoWay(TwoWaySearcher),
916}
917
918#[derive(Clone, Debug)]
919struct EmptyNeedle {
920 position: usize,
921 end: usize,
922 is_match_fw: bool,
923 is_match_bw: bool,
924}
925
926impl<'a, 'b> StrSearcher<'a, 'b> {
927 fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> {
928 if needle.is_empty() {
929 StrSearcher {
930 haystack,
931 needle,
932 searcher: StrSearcherImpl::Empty(EmptyNeedle {
933 position: 0,
934 end: haystack.len(),
935 is_match_fw: true,
936 is_match_bw: true,
937 }),
938 }
939 } else {
940 StrSearcher {
941 haystack,
942 needle,
943 searcher: StrSearcherImpl::TwoWay(TwoWaySearcher::new(
944 needle.as_bytes(),
945 haystack.len(),
946 )),
947 }
948 }
949 }
950}
951
952unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
953 #[inline]
954 fn haystack(&self) -> &'a str {
955 self.haystack
956 }
957
958 #[inline]
959 fn next(&mut self) -> SearchStep {
960 match self.searcher {
961 StrSearcherImpl::Empty(ref mut searcher) => {
962 // empty needle rejects every char and matches every empty string between them
963 let is_match = searcher.is_match_fw;
964 searcher.is_match_fw = !searcher.is_match_fw;
965 let pos = searcher.position;
966 match self.haystack[pos..].chars().next() {
967 _ if is_match => SearchStep::Match(pos, pos),
968 None => SearchStep::Done,
969 Some(ch) => {
970 searcher.position += ch.len_utf8();
971 SearchStep::Reject(pos, searcher.position)
972 }
973 }
974 }
975 StrSearcherImpl::TwoWay(ref mut searcher) => {
976 // TwoWaySearcher produces valid *Match* indices that split at char boundaries
977 // as long as it does correct matching and that haystack and needle are
978 // valid UTF-8
979 // *Rejects* from the algorithm can fall on any indices, but we will walk them
980 // manually to the next character boundary, so that they are utf-8 safe.
981 if searcher.position == self.haystack.len() {
982 return SearchStep::Done;
983 }
984 let is_long = searcher.memory == usize::MAX;
985 match searcher.next::<RejectAndMatch>(
986 self.haystack.as_bytes(),
987 self.needle.as_bytes(),
988 is_long,
989 ) {
990 SearchStep::Reject(a, mut b) => {
991 // skip to next char boundary
992 while !self.haystack.is_char_boundary(b) {
993 b += 1;
994 }
995 searcher.position = cmp::max(b, searcher.position);
996 SearchStep::Reject(a, b)
997 }
998 otherwise => otherwise,
999 }
1000 }
1001 }
1002 }
1003
1004 #[inline]
1005 fn next_match(&mut self) -> Option<(usize, usize)> {
1006 match self.searcher {
1007 StrSearcherImpl::Empty(..) => loop {
1008 match self.next() {
1009 SearchStep::Match(a, b) => return Some((a, b)),
1010 SearchStep::Done => return None,
1011 SearchStep::Reject(..) => {}
1012 }
1013 },
1014 StrSearcherImpl::TwoWay(ref mut searcher) => {
1015 let is_long = searcher.memory == usize::MAX;
1016 // write out `true` and `false` cases to encourage the compiler
1017 // to specialize the two cases separately.
1018 if is_long {
1019 searcher.next::<MatchOnly>(
1020 self.haystack.as_bytes(),
1021 self.needle.as_bytes(),
1022 true,
1023 )
1024 } else {
1025 searcher.next::<MatchOnly>(
1026 self.haystack.as_bytes(),
1027 self.needle.as_bytes(),
1028 false,
1029 )
1030 }
1031 }
1032 }
1033 }
1034}
1035
1036unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
1037 #[inline]
1038 fn next_back(&mut self) -> SearchStep {
1039 match self.searcher {
1040 StrSearcherImpl::Empty(ref mut searcher) => {
1041 let is_match = searcher.is_match_bw;
1042 searcher.is_match_bw = !searcher.is_match_bw;
1043 let end = searcher.end;
1044 match self.haystack[..end].chars().next_back() {
1045 _ if is_match => SearchStep::Match(end, end),
1046 None => SearchStep::Done,
1047 Some(ch) => {
1048 searcher.end -= ch.len_utf8();
1049 SearchStep::Reject(searcher.end, end)
1050 }
1051 }
1052 }
1053 StrSearcherImpl::TwoWay(ref mut searcher) => {
1054 if searcher.end == 0 {
1055 return SearchStep::Done;
1056 }
1057 let is_long = searcher.memory == usize::MAX;
1058 match searcher.next_back::<RejectAndMatch>(
1059 self.haystack.as_bytes(),
1060 self.needle.as_bytes(),
1061 is_long,
1062 ) {
1063 SearchStep::Reject(mut a, b) => {
1064 // skip to next char boundary
1065 while !self.haystack.is_char_boundary(a) {
1066 a -= 1;
1067 }
1068 searcher.end = cmp::min(a, searcher.end);
1069 SearchStep::Reject(a, b)
1070 }
1071 otherwise => otherwise,
1072 }
1073 }
1074 }
1075 }
1076
1077 #[inline]
1078 fn next_match_back(&mut self) -> Option<(usize, usize)> {
1079 match self.searcher {
1080 StrSearcherImpl::Empty(..) => loop {
1081 match self.next_back() {
1082 SearchStep::Match(a, b) => return Some((a, b)),
1083 SearchStep::Done => return None,
1084 SearchStep::Reject(..) => {}
1085 }
1086 },
1087 StrSearcherImpl::TwoWay(ref mut searcher) => {
1088 let is_long = searcher.memory == usize::MAX;
1089 // write out `true` and `false`, like `next_match`
1090 if is_long {
1091 searcher.next_back::<MatchOnly>(
1092 self.haystack.as_bytes(),
1093 self.needle.as_bytes(),
1094 true,
1095 )
1096 } else {
1097 searcher.next_back::<MatchOnly>(
1098 self.haystack.as_bytes(),
1099 self.needle.as_bytes(),
1100 false,
1101 )
1102 }
1103 }
1104 }
1105 }
1106}
1107
1108/// The internal state of the two-way substring search algorithm.
1109#[derive(Clone, Debug)]
1110struct TwoWaySearcher {
1111 // constants
1112 /// critical factorization index
1113 crit_pos: usize,
1114 /// critical factorization index for reversed needle
1115 crit_pos_back: usize,
1116 period: usize,
1117 /// `byteset` is an extension (not part of the two way algorithm);
1118 /// it's a 64-bit "fingerprint" where each set bit `j` corresponds
1119 /// to a (byte & 63) == j present in the needle.
1120 byteset: u64,
1121
1122 // variables
1123 position: usize,
1124 end: usize,
1125 /// index into needle before which we have already matched
1126 memory: usize,
1127 /// index into needle after which we have already matched
1128 memory_back: usize,
1129}
1130
1131/*
1132 This is the Two-Way search algorithm, which was introduced in the paper:
1133 Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
1134
1135 Here's some background information.
1136
1137 A *word* is a string of symbols. The *length* of a word should be a familiar
1138 notion, and here we denote it for any word x by |x|.
1139 (We also allow for the possibility of the *empty word*, a word of length zero).
1140
1141 If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
1142 *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
1143 For example, both 1 and 2 are periods for the string "aa". As another example,
1144 the only period of the string "abcd" is 4.
1145
1146 We denote by period(x) the *smallest* period of x (provided that x is non-empty).
1147 This is always well-defined since every non-empty word x has at least one period,
1148 |x|. We sometimes call this *the period* of x.
1149
1150 If u, v and x are words such that x = uv, where uv is the concatenation of u and
1151 v, then we say that (u, v) is a *factorization* of x.
1152
1153 Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
1154 that both of the following hold
1155
1156 - either w is a suffix of u or u is a suffix of w
1157 - either w is a prefix of v or v is a prefix of w
1158
1159 then w is said to be a *repetition* for the factorization (u, v).
1160
1161 Just to unpack this, there are four possibilities here. Let w = "abc". Then we
1162 might have:
1163
1164 - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
1165 - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
1166 - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
1167 - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
1168
1169 Note that the word vu is a repetition for any factorization (u,v) of x = uv,
1170 so every factorization has at least one repetition.
1171
1172 If x is a string and (u, v) is a factorization for x, then a *local period* for
1173 (u, v) is an integer r such that there is some word w such that |w| = r and w is
1174 a repetition for (u, v).
1175
1176 We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
1177 call this *the local period* of (u, v). Provided that x = uv is non-empty, this
1178 is well-defined (because each non-empty word has at least one factorization, as
1179 noted above).
1180
1181 It can be proven that the following is an equivalent definition of a local period
1182 for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
1183 all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
1184 defined. (i.e., i > 0 and i + r < |x|).
1185
1186 Using the above reformulation, it is easy to prove that
1187
1188 1 <= local_period(u, v) <= period(uv)
1189
1190 A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
1191 *critical factorization*.
1192
1193 The algorithm hinges on the following theorem, which is stated without proof:
1194
1195 **Critical Factorization Theorem** Any word x has at least one critical
1196 factorization (u, v) such that |u| < period(x).
1197
1198 The purpose of maximal_suffix is to find such a critical factorization.
1199
1200 If the period is short, compute another factorization x = u' v' to use
1201 for reverse search, chosen instead so that |v'| < period(x).
1202
1203*/
1204impl TwoWaySearcher {
1205 fn new(needle: &[u8], end: usize) -> TwoWaySearcher {
1206 let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
1207 let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);
1208
1209 let (crit_pos, period) = if crit_pos_false > crit_pos_true {
1210 (crit_pos_false, period_false)
1211 } else {
1212 (crit_pos_true, period_true)
1213 };
1214
1215 // A particularly readable explanation of what's going on here can be found
1216 // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
1217 // see the code for "Algorithm CP" on p. 323.
1218 //
1219 // What's going on is we have some critical factorization (u, v) of the
1220 // needle, and we want to determine whether u is a suffix of
1221 // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
1222 // "Algorithm CP2", which is optimized for when the period of the needle
1223 // is large.
1224 if needle[..crit_pos] == needle[period..period + crit_pos] {
1225 // short period case -- the period is exact
1226 // compute a separate critical factorization for the reversed needle
1227 // x = u' v' where |v'| < period(x).
1228 //
1229 // This is sped up by the period being known already.
1230 // Note that a case like x = "acba" may be factored exactly forwards
1231 // (crit_pos = 1, period = 3) while being factored with approximate
1232 // period in reverse (crit_pos = 2, period = 2). We use the given
1233 // reverse factorization but keep the exact period.
1234 let crit_pos_back = needle.len()
1235 - cmp::max(
1236 TwoWaySearcher::reverse_maximal_suffix(needle, period, false),
1237 TwoWaySearcher::reverse_maximal_suffix(needle, period, true),
1238 );
1239
1240 TwoWaySearcher {
1241 crit_pos,
1242 crit_pos_back,
1243 period,
1244 byteset: Self::byteset_create(&needle[..period]),
1245
1246 position: 0,
1247 end,
1248 memory: 0,
1249 memory_back: needle.len(),
1250 }
1251 } else {
1252 // long period case -- we have an approximation to the actual period,
1253 // and don't use memorization.
1254 //
1255 // Approximate the period by lower bound max(|u|, |v|) + 1.
1256 // The critical factorization is efficient to use for both forward and
1257 // reverse search.
1258
1259 TwoWaySearcher {
1260 crit_pos,
1261 crit_pos_back: crit_pos,
1262 period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
1263 byteset: Self::byteset_create(needle),
1264
1265 position: 0,
1266 end,
1267 memory: usize::MAX, // Dummy value to signify that the period is long
1268 memory_back: usize::MAX,
1269 }
1270 }
1271 }
1272
1273 #[inline]
1274 fn byteset_create(bytes: &[u8]) -> u64 {
1275 bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a)
1276 }
1277
1278 #[inline]
1279 fn byteset_contains(&self, byte: u8) -> bool {
1280 (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0
1281 }
1282
1283 // One of the main ideas of Two-Way is that we factorize the needle into
1284 // two halves, (u, v), and begin trying to find v in the haystack by scanning
1285 // left to right. If v matches, we try to match u by scanning right to left.
1286 // How far we can jump when we encounter a mismatch is all based on the fact
1287 // that (u, v) is a critical factorization for the needle.
1288 #[inline]
1289 fn next<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output
1290 where
1291 S: TwoWayStrategy,
1292 {
1293 // `next()` uses `self.position` as its cursor
1294 let old_pos = self.position;
1295 let needle_last = needle.len() - 1;
1296 'search: loop {
1297 // Check that we have room to search in
1298 // position + needle_last can not overflow if we assume slices
1299 // are bounded by isize's range.
1300 let tail_byte = match haystack.get(self.position + needle_last) {
1301 Some(&b) => b,
1302 None => {
1303 self.position = haystack.len();
1304 return S::rejecting(old_pos, self.position);
1305 }
1306 };
1307
1308 if S::use_early_reject() && old_pos != self.position {
1309 return S::rejecting(old_pos, self.position);
1310 }
1311
1312 // Quickly skip by large portions unrelated to our substring
1313 if !self.byteset_contains(tail_byte) {
1314 self.position += needle.len();
1315 if !long_period {
1316 self.memory = 0;
1317 }
1318 continue 'search;
1319 }
1320
1321 // See if the right part of the needle matches
1322 let start =
1323 if long_period { self.crit_pos } else { cmp::max(self.crit_pos, self.memory) };
1324 for i in start..needle.len() {
1325 if needle[i] != haystack[self.position + i] {
1326 self.position += i - self.crit_pos + 1;
1327 if !long_period {
1328 self.memory = 0;
1329 }
1330 continue 'search;
1331 }
1332 }
1333
1334 // See if the left part of the needle matches
1335 let start = if long_period { 0 } else { self.memory };
1336 for i in (start..self.crit_pos).rev() {
1337 if needle[i] != haystack[self.position + i] {
1338 self.position += self.period;
1339 if !long_period {
1340 self.memory = needle.len() - self.period;
1341 }
1342 continue 'search;
1343 }
1344 }
1345
1346 // We have found a match!
1347 let match_pos = self.position;
1348
1349 // Note: add self.period instead of needle.len() to have overlapping matches
1350 self.position += needle.len();
1351 if !long_period {
1352 self.memory = 0; // set to needle.len() - self.period for overlapping matches
1353 }
1354
1355 return S::matching(match_pos, match_pos + needle.len());
1356 }
1357 }
1358
1359 // Follows the ideas in `next()`.
1360 //
1361 // The definitions are symmetrical, with period(x) = period(reverse(x))
1362 // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
1363 // is a critical factorization, so is (reverse(v), reverse(u)).
1364 //
1365 // For the reverse case we have computed a critical factorization x = u' v'
1366 // (field `crit_pos_back`). We need |u| < period(x) for the forward case and
1367 // thus |v'| < period(x) for the reverse.
1368 //
1369 // To search in reverse through the haystack, we search forward through
1370 // a reversed haystack with a reversed needle, matching first u' and then v'.
1371 #[inline]
1372 fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output
1373 where
1374 S: TwoWayStrategy,
1375 {
1376 // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
1377 // are independent.
1378 let old_end = self.end;
1379 'search: loop {
1380 // Check that we have room to search in
1381 // end - needle.len() will wrap around when there is no more room,
1382 // but due to slice length limits it can never wrap all the way back
1383 // into the length of haystack.
1384 let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) {
1385 Some(&b) => b,
1386 None => {
1387 self.end = 0;
1388 return S::rejecting(0, old_end);
1389 }
1390 };
1391
1392 if S::use_early_reject() && old_end != self.end {
1393 return S::rejecting(self.end, old_end);
1394 }
1395
1396 // Quickly skip by large portions unrelated to our substring
1397 if !self.byteset_contains(front_byte) {
1398 self.end -= needle.len();
1399 if !long_period {
1400 self.memory_back = needle.len();
1401 }
1402 continue 'search;
1403 }
1404
1405 // See if the left part of the needle matches
1406 let crit = if long_period {
1407 self.crit_pos_back
1408 } else {
1409 cmp::min(self.crit_pos_back, self.memory_back)
1410 };
1411 for i in (0..crit).rev() {
1412 if needle[i] != haystack[self.end - needle.len() + i] {
1413 self.end -= self.crit_pos_back - i;
1414 if !long_period {
1415 self.memory_back = needle.len();
1416 }
1417 continue 'search;
1418 }
1419 }
1420
1421 // See if the right part of the needle matches
1422 let needle_end = if long_period { needle.len() } else { self.memory_back };
1423 for i in self.crit_pos_back..needle_end {
1424 if needle[i] != haystack[self.end - needle.len() + i] {
1425 self.end -= self.period;
1426 if !long_period {
1427 self.memory_back = self.period;
1428 }
1429 continue 'search;
1430 }
1431 }
1432
1433 // We have found a match!
1434 let match_pos = self.end - needle.len();
1435 // Note: sub self.period instead of needle.len() to have overlapping matches
1436 self.end -= needle.len();
1437 if !long_period {
1438 self.memory_back = needle.len();
1439 }
1440
1441 return S::matching(match_pos, match_pos + needle.len());
1442 }
1443 }
1444
1445 // Compute the maximal suffix of `arr`.
1446 //
1447 // The maximal suffix is a possible critical factorization (u, v) of `arr`.
1448 //
1449 // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
1450 // period of v.
1451 //
1452 // `order_greater` determines if lexical order is `<` or `>`. Both
1453 // orders must be computed -- the ordering with the largest `i` gives
1454 // a critical factorization.
1455 //
1456 // For long period cases, the resulting period is not exact (it is too short).
1457 #[inline]
1458 fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
1459 let mut left = 0; // Corresponds to i in the paper
1460 let mut right = 1; // Corresponds to j in the paper
1461 let mut offset = 0; // Corresponds to k in the paper, but starting at 0
1462 // to match 0-based indexing.
1463 let mut period = 1; // Corresponds to p in the paper
1464
1465 while let Some(&a) = arr.get(right + offset) {
1466 // `left` will be inbounds when `right` is.
1467 let b = arr[left + offset];
1468 if (a < b && !order_greater) || (a > b && order_greater) {
1469 // Suffix is smaller, period is entire prefix so far.
1470 right += offset + 1;
1471 offset = 0;
1472 period = right - left;
1473 } else if a == b {
1474 // Advance through repetition of the current period.
1475 if offset + 1 == period {
1476 right += offset + 1;
1477 offset = 0;
1478 } else {
1479 offset += 1;
1480 }
1481 } else {
1482 // Suffix is larger, start over from current location.
1483 left = right;
1484 right += 1;
1485 offset = 0;
1486 period = 1;
1487 }
1488 }
1489 (left, period)
1490 }
1491
1492 // Compute the maximal suffix of the reverse of `arr`.
1493 //
1494 // The maximal suffix is a possible critical factorization (u', v') of `arr`.
1495 //
1496 // Returns `i` where `i` is the starting index of v', from the back;
1497 // returns immediately when a period of `known_period` is reached.
1498 //
1499 // `order_greater` determines if lexical order is `<` or `>`. Both
1500 // orders must be computed -- the ordering with the largest `i` gives
1501 // a critical factorization.
1502 //
1503 // For long period cases, the resulting period is not exact (it is too short).
1504 fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize {
1505 let mut left = 0; // Corresponds to i in the paper
1506 let mut right = 1; // Corresponds to j in the paper
1507 let mut offset = 0; // Corresponds to k in the paper, but starting at 0
1508 // to match 0-based indexing.
1509 let mut period = 1; // Corresponds to p in the paper
1510 let n = arr.len();
1511
1512 while right + offset < n {
1513 let a = arr[n - (1 + right + offset)];
1514 let b = arr[n - (1 + left + offset)];
1515 if (a < b && !order_greater) || (a > b && order_greater) {
1516 // Suffix is smaller, period is entire prefix so far.
1517 right += offset + 1;
1518 offset = 0;
1519 period = right - left;
1520 } else if a == b {
1521 // Advance through repetition of the current period.
1522 if offset + 1 == period {
1523 right += offset + 1;
1524 offset = 0;
1525 } else {
1526 offset += 1;
1527 }
1528 } else {
1529 // Suffix is larger, start over from current location.
1530 left = right;
1531 right += 1;
1532 offset = 0;
1533 period = 1;
1534 }
1535 if period == known_period {
1536 break;
1537 }
1538 }
1539 debug_assert!(period <= known_period);
1540 left
1541 }
1542}
1543
1544// TwoWayStrategy allows the algorithm to either skip non-matches as quickly
1545// as possible, or to work in a mode where it emits Rejects relatively quickly.
1546trait TwoWayStrategy {
1547 type Output;
1548 fn use_early_reject() -> bool;
1549 fn rejecting(a: usize, b: usize) -> Self::Output;
1550 fn matching(a: usize, b: usize) -> Self::Output;
1551}
1552
1553/// Skip to match intervals as quickly as possible
1554enum MatchOnly {}
1555
1556impl TwoWayStrategy for MatchOnly {
1557 type Output = Option<(usize, usize)>;
1558
1559 #[inline]
1560 fn use_early_reject() -> bool {
1561 false
1562 }
1563 #[inline]
1564 fn rejecting(_a: usize, _b: usize) -> Self::Output {
1565 None
1566 }
1567 #[inline]
1568 fn matching(a: usize, b: usize) -> Self::Output {
1569 Some((a, b))
1570 }
1571}
1572
1573/// Emit Rejects regularly
1574enum RejectAndMatch {}
1575
1576impl TwoWayStrategy for RejectAndMatch {
1577 type Output = SearchStep;
1578
1579 #[inline]
1580 fn use_early_reject() -> bool {
1581 true
1582 }
1583 #[inline]
1584 fn rejecting(a: usize, b: usize) -> Self::Output {
1585 SearchStep::Reject(a, b)
1586 }
1587 #[inline]
1588 fn matching(a: usize, b: usize) -> Self::Output {
1589 SearchStep::Match(a, b)
1590 }
1591}