1#![unstable(
14 feature = "wtf8_internals",
15 issue = "none",
16 reason = "this is internal code for representing OsStr on some platforms and not a public API"
17)]
18#![doc(hidden)]
21
22use crate::char::{MAX_LEN_UTF16, encode_utf16_raw};
23use crate::clone::CloneToUninit;
24use crate::fmt::{self, Write};
25use crate::hash::{Hash, Hasher};
26use crate::iter::FusedIterator;
27use crate::num::niche_types::CodePointInner;
28use crate::str::next_code_point;
29use crate::{ops, slice, str};
30
/// A single code point, stored as a `CodePointInner`.
///
/// Unlike `char`, a `CodePoint` may hold a surrogate value
/// (`0xD800..=0xDFFF`); see `to_lead_surrogate`, `to_trail_surrogate`
/// and `to_char`.
///
/// NOTE(review): the exact range accepted by `CodePointInner` is not visible
/// here; presumably `0..=0x10FFFF` -- confirm against its definition.
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
#[doc(hidden)]
pub struct CodePoint(CodePointInner);
39
40impl fmt::Debug for CodePoint {
43 #[inline]
44 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
45 write!(formatter, "U+{:04X}", self.0.as_inner())
46 }
47}
48
49impl CodePoint {
50 #[inline]
54 pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
55 CodePoint(unsafe { CodePointInner::new_unchecked(value) })
57 }
58
59 #[inline]
63 pub fn from_u32(value: u32) -> Option<CodePoint> {
64 Some(CodePoint(CodePointInner::new(value)?))
65 }
66
67 #[inline]
71 pub fn from_char(value: char) -> CodePoint {
72 unsafe { CodePoint::from_u32_unchecked(value as u32) }
74 }
75
76 #[inline]
78 pub fn to_u32(&self) -> u32 {
79 self.0.as_inner()
80 }
81
82 #[inline]
84 pub fn to_lead_surrogate(&self) -> Option<u16> {
85 match self.to_u32() {
86 lead @ 0xD800..=0xDBFF => Some(lead as u16),
87 _ => None,
88 }
89 }
90
91 #[inline]
93 pub fn to_trail_surrogate(&self) -> Option<u16> {
94 match self.to_u32() {
95 trail @ 0xDC00..=0xDFFF => Some(trail as u16),
96 _ => None,
97 }
98 }
99
100 #[inline]
104 pub fn to_char(&self) -> Option<char> {
105 match self.to_u32() {
106 0xD800..=0xDFFF => None,
107 valid => Some(unsafe { char::from_u32_unchecked(valid) }),
109 }
110 }
111
112 #[inline]
117 pub fn to_char_lossy(&self) -> char {
118 self.to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
119 }
120}
121
/// A dynamically sized byte string that the rest of this module treats as
/// WTF-8: UTF-8 that may additionally contain three-byte encodings of
/// surrogate code points (lead byte `0xED`, second byte `>= 0xA0`).
///
/// Equality and ordering are derived, so they compare the raw bytes.
/// The type is `#[repr(transparent)]` over `[u8]`; `from_bytes_unchecked`
/// and `from_mut_bytes_unchecked` rely on that when casting slice pointers.
#[derive(Eq, Ord, PartialEq, PartialOrd)]
#[repr(transparent)]
#[rustc_has_incoherent_inherent_impls]
#[doc(hidden)]
pub struct Wtf8 {
    // The encoded contents. Unsafe code in this module passes sub-ranges of
    // these bytes to `str::from_utf8_unchecked`, so they must be well formed.
    bytes: [u8],
}
133
134impl AsRef<[u8]> for Wtf8 {
135 #[inline]
136 fn as_ref(&self) -> &[u8] {
137 &self.bytes
138 }
139}
140
141impl fmt::Debug for Wtf8 {
145 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
146 fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
147 use crate::fmt::Write;
148 for c in s.chars().flat_map(|c| c.escape_debug()) {
149 f.write_char(c)?
150 }
151 Ok(())
152 }
153
154 formatter.write_str("\"")?;
155 let mut pos = 0;
156 while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
157 write_str_escaped(formatter, unsafe {
159 str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
160 })?;
161 write!(formatter, "\\u{{{:x}}}", surrogate)?;
162 pos = surrogate_pos + 3;
163 }
164
165 write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
167 formatter.write_str("\"")
168 }
169}
170
171impl fmt::Display for Wtf8 {
174 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
175 let wtf8_bytes = &self.bytes;
176 let mut pos = 0;
177 loop {
178 match self.next_surrogate(pos) {
179 Some((surrogate_pos, _)) => {
180 formatter.write_str(unsafe {
182 str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
183 })?;
184 formatter.write_char(char::REPLACEMENT_CHARACTER)?;
185 pos = surrogate_pos + 3;
186 }
187 None => {
188 let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
190 if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) }
191 }
192 }
193 }
194 }
195}
196
197impl Wtf8 {
198 #[inline]
200 pub fn from_str(value: &str) -> &Wtf8 {
201 unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
203 }
204
205 #[inline]
210 pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
211 unsafe { &*(value as *const [u8] as *const Wtf8) }
213 }
214
215 #[inline]
220 pub unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
221 unsafe { &mut *(value as *mut [u8] as *mut Wtf8) }
223 }
224
225 #[inline]
227 pub fn len(&self) -> usize {
228 self.bytes.len()
229 }
230
231 #[inline]
232 pub fn is_empty(&self) -> bool {
233 self.bytes.is_empty()
234 }
235
236 #[inline]
243 pub fn ascii_byte_at(&self, position: usize) -> u8 {
244 match self.bytes[position] {
245 ascii_byte @ 0x00..=0x7F => ascii_byte,
246 _ => 0xFF,
247 }
248 }
249
250 #[inline]
252 pub fn code_points(&self) -> Wtf8CodePoints<'_> {
253 Wtf8CodePoints { bytes: self.bytes.iter() }
254 }
255
256 #[inline]
258 pub fn as_bytes(&self) -> &[u8] {
259 &self.bytes
260 }
261
262 #[inline]
268 pub fn as_str(&self) -> Result<&str, str::Utf8Error> {
269 str::from_utf8(&self.bytes)
270 }
271
272 #[inline]
279 pub fn encode_wide(&self) -> EncodeWide<'_> {
280 EncodeWide { code_points: self.code_points(), extra: 0 }
281 }
282
283 #[inline]
284 pub fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
285 let mut iter = self.bytes[pos..].iter();
286 loop {
287 let b = *iter.next()?;
288 if b < 0x80 {
289 pos += 1;
290 } else if b < 0xE0 {
291 iter.next();
292 pos += 2;
293 } else if b == 0xED {
294 match (iter.next(), iter.next()) {
295 (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
296 return Some((pos, decode_surrogate(b2, b3)));
297 }
298 _ => pos += 3,
299 }
300 } else if b < 0xF0 {
301 iter.next();
302 iter.next();
303 pos += 3;
304 } else {
305 iter.next();
306 iter.next();
307 iter.next();
308 pos += 4;
309 }
310 }
311 }
312
313 #[inline]
314 pub fn final_lead_surrogate(&self) -> Option<u16> {
315 match self.bytes {
316 [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
317 _ => None,
318 }
319 }
320
321 #[inline]
322 pub fn initial_trail_surrogate(&self) -> Option<u16> {
323 match self.bytes {
324 [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)),
325 _ => None,
326 }
327 }
328
329 #[inline]
330 pub fn make_ascii_lowercase(&mut self) {
331 self.bytes.make_ascii_lowercase()
332 }
333
334 #[inline]
335 pub fn make_ascii_uppercase(&mut self) {
336 self.bytes.make_ascii_uppercase()
337 }
338
339 #[inline]
340 pub fn is_ascii(&self) -> bool {
341 self.bytes.is_ascii()
342 }
343
344 #[inline]
345 pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
346 self.bytes.eq_ignore_ascii_case(&other.bytes)
347 }
348}
349
350impl ops::Index<ops::Range<usize>> for Wtf8 {
357 type Output = Wtf8;
358
359 #[inline]
360 fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
361 if range.start <= range.end
362 && self.is_code_point_boundary(range.start)
363 && self.is_code_point_boundary(range.end)
364 {
365 unsafe { slice_unchecked(self, range.start, range.end) }
367 } else {
368 slice_error_fail(self, range.start, range.end)
369 }
370 }
371}
372
373impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
380 type Output = Wtf8;
381
382 #[inline]
383 fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
384 if self.is_code_point_boundary(range.start) {
385 unsafe { slice_unchecked(self, range.start, self.len()) }
387 } else {
388 slice_error_fail(self, range.start, self.len())
389 }
390 }
391}
392
393impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
400 type Output = Wtf8;
401
402 #[inline]
403 fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
404 if self.is_code_point_boundary(range.end) {
405 unsafe { slice_unchecked(self, 0, range.end) }
407 } else {
408 slice_error_fail(self, 0, range.end)
409 }
410 }
411}
412
/// Indexing with `..` returns the whole string unchanged.
impl ops::Index<ops::RangeFull> for Wtf8 {
    type Output = Wtf8;

    #[inline]
    fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
        // The full range always covers the entire string; nothing to check.
        self
    }
}
421
/// Decodes the 16-bit surrogate value from the second and third bytes of a
/// three-byte sequence.
///
/// Callers in this file only invoke it after matching a lead byte of `0xED`.
/// The low six bits of each argument are combined and offset by `0xD800`.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    let high_bits = u16::from(second_byte & 0x3F) << 6;
    let low_bits = u16::from(third_byte & 0x3F);
    0xD800 | high_bits | low_bits
}
427
428impl Wtf8 {
429 #[inline]
431 pub fn is_code_point_boundary(&self, index: usize) -> bool {
432 if index == 0 {
433 return true;
434 }
435 match self.bytes.get(index) {
436 None => index == self.len(),
437 Some(&b) => (b as i8) >= -0x40,
438 }
439 }
440
441 #[track_caller]
449 #[inline]
450 pub fn check_utf8_boundary(&self, index: usize) {
451 if index == 0 {
452 return;
453 }
454 match self.bytes.get(index) {
455 Some(0xED) => (), Some(&b) if (b as i8) >= -0x40 => return,
457 Some(_) => panic!("byte index {index} is not a codepoint boundary"),
458 None if index == self.len() => return,
459 None => panic!("byte index {index} is out of bounds"),
460 }
461 if self.bytes[index + 1] >= 0xA0 {
462 if index >= 3 && self.bytes[index - 3] == 0xED && self.bytes[index - 2] >= 0xA0 {
464 panic!("byte index {index} lies between surrogate codepoints");
465 }
466 }
467 }
468}
469
470#[inline]
472unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
473 unsafe {
475 let len = end - begin;
476 let start = s.as_bytes().as_ptr().add(begin);
477 Wtf8::from_bytes_unchecked(slice::from_raw_parts(start, len))
478 }
479}
480
/// Reports a failed slicing operation; never returns.
///
/// Panics with an assertion failure when `begin > end`, and otherwise with a
/// message naming both indices and the `Debug` rendering of `s`.
/// Marked `#[inline(never)]`, so it stays out of line from its callers.
#[inline(never)]
fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
    // A reversed range gets its own (assertion) message.
    assert!(begin <= end);
    panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary");
}
487
/// Iterator over the code points of a `Wtf8` string.
///
/// Created by `Wtf8::code_points`.
#[derive(Clone)]
#[doc(hidden)]
pub struct Wtf8CodePoints<'a> {
    // Bytes of the string that have not been decoded yet.
    bytes: slice::Iter<'a, u8>,
}
496
497impl Iterator for Wtf8CodePoints<'_> {
498 type Item = CodePoint;
499
500 #[inline]
501 fn next(&mut self) -> Option<CodePoint> {
502 unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint::from_u32_unchecked(c)) }
504 }
505
506 #[inline]
507 fn size_hint(&self) -> (usize, Option<usize>) {
508 let len = self.bytes.len();
509 (len.saturating_add(3) / 4, Some(len))
510 }
511}
512
513impl fmt::Debug for Wtf8CodePoints<'_> {
514 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
515 f.debug_tuple("Wtf8CodePoints")
516 .field(&unsafe { Wtf8::from_bytes_unchecked(self.bytes.as_slice()) })
518 .finish()
519 }
520}
521
/// Iterator yielding the 16-bit code units of a `Wtf8` string.
///
/// Created by `Wtf8::encode_wide`.
#[stable(feature = "rust1", since = "1.0.0")]
#[derive(Clone)]
#[doc(hidden)]
pub struct EncodeWide<'a> {
    // Source of the code points still to be encoded.
    code_points: Wtf8CodePoints<'a>,
    // Second unit of a two-unit encoding, held until the following call to
    // `next`; zero means nothing is pending.
    extra: u16,
}
530
531#[stable(feature = "rust1", since = "1.0.0")]
533impl Iterator for EncodeWide<'_> {
534 type Item = u16;
535
536 #[inline]
537 fn next(&mut self) -> Option<u16> {
538 if self.extra != 0 {
539 let tmp = self.extra;
540 self.extra = 0;
541 return Some(tmp);
542 }
543
544 let mut buf = [0; MAX_LEN_UTF16];
545 self.code_points.next().map(|code_point| {
546 let n = encode_utf16_raw(code_point.to_u32(), &mut buf).len();
547 if n == 2 {
548 self.extra = buf[1];
549 }
550 buf[0]
551 })
552 }
553
554 #[inline]
555 fn size_hint(&self) -> (usize, Option<usize>) {
556 let (low, high) = self.code_points.size_hint();
557 let ext = (self.extra != 0) as usize;
558 (low + ext, high.and_then(|n| n.checked_mul(2)).and_then(|n| n.checked_add(ext)))
562 }
563}
564
565impl fmt::Debug for EncodeWide<'_> {
566 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
567 f.debug_struct("EncodeWide").finish_non_exhaustive()
568 }
569}
570
// Marker impl declaring that `EncodeWide` keeps returning `None` once it
// has returned `None`.
// NOTE(review): this depends on `Wtf8CodePoints::next` also staying
// exhausted; confirm against `next_code_point`.
#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")]
impl FusedIterator for EncodeWide<'_> {}
573
574impl Hash for CodePoint {
575 #[inline]
576 fn hash<H: Hasher>(&self, state: &mut H) {
577 self.0.hash(state)
578 }
579}
580
581impl Hash for Wtf8 {
582 #[inline]
583 fn hash<H: Hasher>(&self, state: &mut H) {
584 state.write(&self.bytes);
585 0xfeu8.hash(state)
586 }
587}
588
589#[unstable(feature = "clone_to_uninit", issue = "126799")]
590unsafe impl CloneToUninit for Wtf8 {
591 #[inline]
592 #[cfg_attr(debug_assertions, track_caller)]
593 unsafe fn clone_to_uninit(&self, dst: *mut u8) {
594 unsafe { self.bytes.clone_to_uninit(dst) }
596 }
597}