Files
adblock
addr
addr2line
adler
aho_corasick
backtrace
base64
bitflags
byteorder
cfg_if
crc32fast
ctor
either
error_chain
flate2
foreign_types
foreign_types_shared
ghost
gimli
idna
indoc
indoc_impl
instant
inventory
inventory_impl
itertools
lazy_static
libc
lock_api
log
matches
memchr
miniz_oxide
native_tls
num_traits
object
once_cell
openssl
openssl_probe
openssl_sys
parking_lot
parking_lot_core
paste
paste_impl
percent_encoding
proc_macro2
proc_macro_hack
psl
psl_codegen
psl_lexer
pyo3
pyo3_derive_backend
pyo3cls
quote
regex
regex_syntax
rental
rental_impl
rmp
rmp_serde
rustc_demangle
scopeguard
seahash
sequence_trie
serde
serde_derive
smallvec
stable_deref_trait
syn
thread_local
tinyvec
twoway
unchecked_index
unicode_bidi
unicode_normalization
unicode_xid
unindent
url
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
use crate::lookups::{
    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
    stream_safe_trailing_nonstarters,
};
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
use crate::tables::stream_safe_leading_nonstarters;

pub(crate) const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';

/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
pub struct StreamSafe<I> {
    iter: I,
    nonstarter_count: usize,
    buffer: Option<char>,
}

impl<I> StreamSafe<I> {
    pub(crate) fn new(iter: I) -> Self {
        Self {
            iter,
            nonstarter_count: 0,
            buffer: None,
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        if let Some(ch) = self.buffer.take() {
            return Some(ch);
        }
        let next_ch = match self.iter.next() {
            None => return None,
            Some(c) => c,
        };
        let d = classify_nonstarters(next_ch);
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
            self.buffer = Some(next_ch);
            self.nonstarter_count = 0;
            return Some(COMBINING_GRAPHEME_JOINER);
        }

        // No starters in the decomposition, so keep accumulating
        if d.leading_nonstarters == d.decomposition_len {
            self.nonstarter_count += d.decomposition_len;
        }
        // Otherwise, restart the nonstarter counter.
        else {
            self.nonstarter_count = d.trailing_nonstarters;
        }
        Some(next_ch)
    }
}

#[derive(Debug)]
pub(crate) struct Decomposition {
    pub(crate) leading_nonstarters: usize,
    pub(crate) trailing_nonstarters: usize,
    pub(crate) decomposition_len: usize,
}

#[inline]
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
    // As usual, fast path for ASCII (which is always a starter)
    if c <= '\x7f' {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: 1,
        };
    }
    // Next, special case Hangul, since it's not handled by our tables.
    if is_hangul_syllable(c) {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: hangul_decomposition_length(c),
        };
    }
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
    match decomp {
        Some(decomp) => Decomposition {
            leading_nonstarters: stream_safe_leading_nonstarters(c),
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
            decomposition_len: decomp.len(),
        },
        None => {
            let is_nonstarter = canonical_combining_class(c) != 0;
            let nonstarter = if is_nonstarter { 1 } else { 0 };
            Decomposition {
                leading_nonstarters: nonstarter,
                trailing_nonstarters: nonstarter,
                decomposition_len: 1,
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{classify_nonstarters, StreamSafe};
    use crate::lookups::canonical_combining_class;
    use crate::normalize::decompose_compatible;

    #[cfg(not(feature = "std"))]
    use crate::no_std_prelude::*;

    use core::char;

    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    #[test]
    fn test_simple() {
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(technically_okay), technically_okay);

        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        assert_ne!(stream_safe(too_much), too_much);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for ch in 0..0x2FA1E {
            let ch = match char::from_u32(ch) {
                Some(c) => c,
                None => continue,
            };
            let c = classify_nonstarters(ch);
            let mut s = Vec::new();
            decompose_compatible(ch, |c| s.push(c));

            assert_eq!(s.len(), c.decomposition_len);

            let num_leading = s
                .iter()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();
            let num_trailing = s
                .iter()
                .rev()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();

            assert_eq!(num_leading, c.leading_nonstarters);
            assert_eq!(num_trailing, c.trailing_nonstarters);
        }
    }
}