rustdoc/html/render/search_index/
encode.rs

1use base64::prelude::*;
2
3pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) {
4    let (sign, magnitude): (bool, u32) =
5        if n >= 0 { (false, n.try_into().unwrap()) } else { (true, (-n).try_into().unwrap()) };
6    // zig-zag encoding
7    let value: u32 = (magnitude << 1) | (if sign { 1 } else { 0 });
8    // Self-terminating hex use capital letters for everything but the
9    // least significant digit, which is lowercase. For example, decimal 17
10    // would be `` Aa `` if zig-zag encoding weren't used.
11    //
12    // Zig-zag encoding, however, stores the sign bit as the last bit.
13    // This means, in the last hexit, 1 is actually `c`, -1 is `b`
14    // (`a` is the imaginary -0), and, because all the bits are shifted
15    // by one, `` A` `` is actually 8 and `` Aa `` is -8.
16    //
17    // https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html
18    // describes the encoding in more detail.
19    let mut shift: u32 = 28;
20    let mut mask: u32 = 0xF0_00_00_00;
21    // first skip leading zeroes
22    while shift < 32 {
23        let hexit = (value & mask) >> shift;
24        if hexit != 0 || shift == 0 {
25            break;
26        }
27        shift = shift.wrapping_sub(4);
28        mask >>= 4;
29    }
30    // now write the rest
31    while shift < 32 {
32        let hexit = (value & mask) >> shift;
33        let hex = char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
34        string.push(hex);
35        shift = shift.wrapping_sub(4);
36        mask >>= 4;
37    }
38}
39
40// Used during bitmap encoding
41enum Container {
42    /// number of ones, bits
43    Bits(Box<[u64; 1024]>),
44    /// list of entries
45    Array(Vec<u16>),
46    /// list of (start, len-1)
47    Run(Vec<(u16, u16)>),
48}
49impl Container {
50    fn popcount(&self) -> u32 {
51        match self {
52            Container::Bits(bits) => bits.iter().copied().map(|x| x.count_ones()).sum(),
53            Container::Array(array) => {
54                array.len().try_into().expect("array can't be bigger than 2**32")
55            }
56            Container::Run(runs) => {
57                runs.iter().copied().map(|(_, lenm1)| u32::from(lenm1) + 1).sum()
58            }
59        }
60    }
61    fn push(&mut self, value: u16) {
62        match self {
63            Container::Bits(bits) => bits[value as usize >> 6] |= 1 << (value & 0x3F),
64            Container::Array(array) => {
65                array.push(value);
66                if array.len() >= 4096 {
67                    let array = std::mem::take(array);
68                    *self = Container::Bits(Box::new([0; 1024]));
69                    for value in array {
70                        self.push(value);
71                    }
72                }
73            }
74            Container::Run(runs) => {
75                if let Some(r) = runs.last_mut()
76                    && r.0 + r.1 + 1 == value
77                {
78                    r.1 += 1;
79                } else {
80                    runs.push((value, 0));
81                }
82            }
83        }
84    }
85    fn try_make_run(&mut self) -> bool {
86        match self {
87            Container::Bits(bits) => {
88                let mut r: u64 = 0;
89                for (i, chunk) in bits.iter().copied().enumerate() {
90                    let next_chunk =
91                        i.checked_add(1).and_then(|i| bits.get(i)).copied().unwrap_or(0);
92                    r += !chunk & u64::from((chunk << 1).count_ones());
93                    r += !next_chunk & u64::from((chunk >> 63).count_ones());
94                }
95                if (2 + 4 * r) >= 8192 {
96                    return false;
97                }
98                let bits = std::mem::replace(bits, Box::new([0; 1024]));
99                *self = Container::Run(Vec::new());
100                for (i, bits) in bits.iter().copied().enumerate() {
101                    if bits == 0 {
102                        continue;
103                    }
104                    for j in 0..64 {
105                        let value = (u16::try_from(i).unwrap() << 6) | j;
106                        if bits & (1 << j) != 0 {
107                            self.push(value);
108                        }
109                    }
110                }
111                true
112            }
113            Container::Array(array) if array.len() <= 5 => false,
114            Container::Array(array) => {
115                let mut r = 0;
116                let mut prev = None;
117                for value in array.iter().copied() {
118                    if value.checked_sub(1) != prev {
119                        r += 1;
120                    }
121                    prev = Some(value);
122                }
123                if 2 + 4 * r >= 2 * array.len() + 2 {
124                    return false;
125                }
126                let array = std::mem::take(array);
127                *self = Container::Run(Vec::new());
128                for value in array {
129                    self.push(value);
130                }
131                true
132            }
133            Container::Run(_) => true,
134        }
135    }
136}
137
138// checked against roaring-rs in
139// https://gitlab.com/notriddle/roaring-test
140pub(crate) fn write_bitmap_to_bytes(
141    domain: &[u32],
142    mut out: impl std::io::Write,
143) -> std::io::Result<()> {
144    // https://arxiv.org/pdf/1603.06549.pdf
145    let mut keys = Vec::<u16>::new();
146    let mut containers = Vec::<Container>::new();
147    let mut key: u16;
148    let mut domain_iter = domain.iter().copied().peekable();
149    let mut has_run = false;
150    while let Some(entry) = domain_iter.next() {
151        key = (entry >> 16).try_into().expect("shifted off the top 16 bits, so it should fit");
152        let value: u16 = (entry & 0x00_00_FF_FF).try_into().expect("AND 16 bits, so it should fit");
153        let mut container = Container::Array(vec![value]);
154        while let Some(entry) = domain_iter.peek().copied() {
155            let entry_key: u16 =
156                (entry >> 16).try_into().expect("shifted off the top 16 bits, so it should fit");
157            if entry_key != key {
158                break;
159            }
160            domain_iter.next().expect("peeking just succeeded");
161            container
162                .push((entry & 0x00_00_FF_FF).try_into().expect("AND 16 bits, so it should fit"));
163        }
164        keys.push(key);
165        has_run = container.try_make_run() || has_run;
166        containers.push(container);
167    }
168    // https://github.com/RoaringBitmap/RoaringFormatSpec
169    const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
170    const SERIAL_COOKIE: u32 = 12347;
171    const NO_OFFSET_THRESHOLD: u32 = 4;
172    let size: u32 = containers.len().try_into().unwrap();
173    let start_offset = if has_run {
174        out.write_all(&u32::to_le_bytes(SERIAL_COOKIE | ((size - 1) << 16)))?;
175        for set in containers.chunks(8) {
176            let mut b = 0;
177            for (i, container) in set.iter().enumerate() {
178                if matches!(container, &Container::Run(..)) {
179                    b |= 1 << i;
180                }
181            }
182            out.write_all(&[b])?;
183        }
184        if size < NO_OFFSET_THRESHOLD {
185            4 + 4 * size + ((size + 7) / 8)
186        } else {
187            4 + 8 * size + ((size + 7) / 8)
188        }
189    } else {
190        out.write_all(&u32::to_le_bytes(SERIAL_COOKIE_NO_RUNCONTAINER))?;
191        out.write_all(&u32::to_le_bytes(containers.len().try_into().unwrap()))?;
192        4 + 4 + 4 * size + 4 * size
193    };
194    for (&key, container) in keys.iter().zip(&containers) {
195        // descriptive header
196        let key: u32 = key.into();
197        let count: u32 = container.popcount() - 1;
198        out.write_all(&u32::to_le_bytes((count << 16) | key))?;
199    }
200    if !has_run || size >= NO_OFFSET_THRESHOLD {
201        // offset header
202        let mut starting_offset = start_offset;
203        for container in &containers {
204            out.write_all(&u32::to_le_bytes(starting_offset))?;
205            starting_offset += match container {
206                Container::Bits(_) => 8192u32,
207                Container::Array(array) => u32::try_from(array.len()).unwrap() * 2,
208                Container::Run(runs) => 2 + u32::try_from(runs.len()).unwrap() * 4,
209            };
210        }
211    }
212    for container in &containers {
213        match container {
214            Container::Bits(bits) => {
215                for chunk in bits.iter() {
216                    out.write_all(&u64::to_le_bytes(*chunk))?;
217                }
218            }
219            Container::Array(array) => {
220                for value in array.iter() {
221                    out.write_all(&u16::to_le_bytes(*value))?;
222                }
223            }
224            Container::Run(runs) => {
225                out.write_all(&u16::to_le_bytes(runs.len().try_into().unwrap()))?;
226                for (start, lenm1) in runs.iter().copied() {
227                    out.write_all(&u16::to_le_bytes(start))?;
228                    out.write_all(&u16::to_le_bytes(lenm1))?;
229                }
230            }
231        }
232    }
233    Ok(())
234}
235
236pub(crate) fn bitmap_to_string(domain: &[u32]) -> String {
237    let mut buf = Vec::new();
238    let mut strbuf = String::new();
239    write_bitmap_to_bytes(domain, &mut buf).unwrap();
240    BASE64_STANDARD.encode_string(&buf, &mut strbuf);
241    strbuf
242}