libstdc++
text_encoding
Go to the documentation of this file.
1// <text_encoding> -*- C++ -*-
2
3// Copyright The GNU Toolchain Authors.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25/** @file include/text_encoding
26 * This is a Standard C++ Library header.
27 */
28
29#ifndef _GLIBCXX_TEXT_ENCODING
30#define _GLIBCXX_TEXT_ENCODING
31
32#pragma GCC system_header
33
34#include <bits/requires_hosted.h>
35
36#define __glibcxx_want_text_encoding
37#include <bits/version.h>
38
39#ifdef __cpp_lib_text_encoding
40#include <compare>
41#include <string_view>
42#include <bits/functional_hash.h> // hash
43#include <bits/ranges_util.h> // view_interface
44#include <bits/unicode.h> // __charset_alias_match
45#include <ext/numeric_traits.h> // __int_traits
46
47namespace std _GLIBCXX_VISIBILITY(default)
48{
49_GLIBCXX_BEGIN_NAMESPACE_VERSION
50
51 /**
52 * @brief An interface for accessing the IANA Character Sets registry.
53 * @ingroup locales
54 * @since C++23
55 */
56 struct text_encoding
57 {
58 private:
// One row of the encoding table: a numeric identifier paired with a name.
// Kept as a plain aggregate so rows can be written as { id, "name" }.
struct _Rep
{
  using id = __INT_LEAST32_TYPE__;

  id _M_id;             // Numeric identifier of this row.
  const char* _M_name;  // Name of this row; empty or null for special rows.

  // Order a row against a bare identifier, so that the table can be
  // searched by identifier with std::lower_bound.
  friend constexpr bool
  operator<(const _Rep& __row, id __key) noexcept
  { return __key > __row._M_id; }

  // Exact (character-for-character) comparison of the row's name.
  friend constexpr bool
  operator==(const _Rep& __row, string_view __name) noexcept
  { return string_view(__row._M_name) == __name; }
};
73
74 public:
// Maximum number of characters of an encoding name that are stored,
// not counting the terminating null (the buffer _M_name is one larger).
75 static constexpr size_t max_name_length = 63;
76
// Identifiers for the known encodings; the underlying type is _Rep::id so
// a value can be compared directly with _Rep::_M_id.
// NOTE(review): these look like the IANA registry's MIBenum numbers (see
// the class description and mib()) -- confirm against the registry.
//
// The values are not contiguous (e.g. 33-34, 107-108 and 1022-1999 are
// unused), so an identifier cannot be used as an index into a table and
// must be searched for instead.
77 enum class id : _Rep::id
78 {
79 other = 1,
80 unknown = 2,
81 ASCII = 3,
82 ISOLatin1 = 4,
83 ISOLatin2 = 5,
84 ISOLatin3 = 6,
85 ISOLatin4 = 7,
86 ISOLatinCyrillic = 8,
87 ISOLatinArabic = 9,
88 ISOLatinGreek = 10,
89 ISOLatinHebrew = 11,
90 ISOLatin5 = 12,
91 ISOLatin6 = 13,
92 ISOTextComm = 14,
93 HalfWidthKatakana = 15,
94 JISEncoding = 16,
95 ShiftJIS = 17,
96 EUCPkdFmtJapanese = 18,
97 EUCFixWidJapanese = 19,
98 ISO4UnitedKingdom = 20,
99 ISO11SwedishForNames = 21,
100 ISO15Italian = 22,
101 ISO17Spanish = 23,
102 ISO21German = 24,
103 ISO60DanishNorwegian = 25,
104 ISO69French = 26,
105 ISO10646UTF1 = 27,
106 ISO646basic1983 = 28,
107 INVARIANT = 29,
108 ISO2IntlRefVersion = 30,
109 NATSSEFI = 31,
110 NATSSEFIADD = 32,
111 ISO10Swedish = 35,
112 KSC56011987 = 36,
113 ISO2022KR = 37,
114 EUCKR = 38,
115 ISO2022JP = 39,
116 ISO2022JP2 = 40,
117 ISO13JISC6220jp = 41,
118 ISO14JISC6220ro = 42,
119 ISO16Portuguese = 43,
120 ISO18Greek7Old = 44,
121 ISO19LatinGreek = 45,
122 ISO25French = 46,
123 ISO27LatinGreek1 = 47,
124 ISO5427Cyrillic = 48,
125 ISO42JISC62261978 = 49,
126 ISO47BSViewdata = 50,
127 ISO49INIS = 51,
128 ISO50INIS8 = 52,
129 ISO51INISCyrillic = 53,
130 ISO54271981 = 54,
131 ISO5428Greek = 55,
132 ISO57GB1988 = 56,
133 ISO58GB231280 = 57,
134 ISO61Norwegian2 = 58,
135 ISO70VideotexSupp1 = 59,
136 ISO84Portuguese2 = 60,
137 ISO85Spanish2 = 61,
138 ISO86Hungarian = 62,
139 ISO87JISX0208 = 63,
140 ISO88Greek7 = 64,
141 ISO89ASMO449 = 65,
142 ISO90 = 66,
143 ISO91JISC62291984a = 67,
144 ISO92JISC62991984b = 68,
145 ISO93JIS62291984badd = 69,
146 ISO94JIS62291984hand = 70,
147 ISO95JIS62291984handadd = 71,
148 ISO96JISC62291984kana = 72,
149 ISO2033 = 73,
150 ISO99NAPLPS = 74,
151 ISO102T617bit = 75,
152 ISO103T618bit = 76,
153 ISO111ECMACyrillic = 77,
154 ISO121Canadian1 = 78,
155 ISO122Canadian2 = 79,
156 ISO123CSAZ24341985gr = 80,
157 ISO88596E = 81,
158 ISO88596I = 82,
159 ISO128T101G2 = 83,
160 ISO88598E = 84,
161 ISO88598I = 85,
162 ISO139CSN369103 = 86,
163 ISO141JUSIB1002 = 87,
164 ISO143IECP271 = 88,
165 ISO146Serbian = 89,
166 ISO147Macedonian = 90,
167 ISO150 = 91,
168 ISO151Cuba = 92,
169 ISO6937Add = 93,
170 ISO153GOST1976874 = 94,
171 ISO8859Supp = 95,
172 ISO10367Box = 96,
173 ISO158Lap = 97,
174 ISO159JISX02121990 = 98,
175 ISO646Danish = 99,
176 USDK = 100,
177 DKUS = 101,
178 KSC5636 = 102,
179 Unicode11UTF7 = 103,
180 ISO2022CN = 104,
181 ISO2022CNEXT = 105,
182 UTF8 = 106,
183 ISO885913 = 109,
184 ISO885914 = 110,
185 ISO885915 = 111,
186 ISO885916 = 112,
187 GBK = 113,
188 GB18030 = 114,
189 OSDEBCDICDF0415 = 115,
190 OSDEBCDICDF03IRV = 116,
191 OSDEBCDICDF041 = 117,
192 ISO115481 = 118,
193 KZ1048 = 119,
194 UCS2 = 1000,
195 UCS4 = 1001,
196 UnicodeASCII = 1002,
197 UnicodeLatin1 = 1003,
198 UnicodeJapanese = 1004,
199 UnicodeIBM1261 = 1005,
200 UnicodeIBM1268 = 1006,
201 UnicodeIBM1276 = 1007,
202 UnicodeIBM1264 = 1008,
203 UnicodeIBM1265 = 1009,
204 Unicode11 = 1010,
205 SCSU = 1011,
206 UTF7 = 1012,
207 UTF16BE = 1013,
208 UTF16LE = 1014,
209 UTF16 = 1015,
210 CESU8 = 1016,
211 UTF32 = 1017,
212 UTF32BE = 1018,
213 UTF32LE = 1019,
214 BOCU1 = 1020,
215 UTF7IMAP = 1021,
216 Windows30Latin1 = 2000,
217 Windows31Latin1 = 2001,
218 Windows31Latin2 = 2002,
219 Windows31Latin5 = 2003,
220 HPRoman8 = 2004,
221 AdobeStandardEncoding = 2005,
222 VenturaUS = 2006,
223 VenturaInternational = 2007,
224 DECMCS = 2008,
225 PC850Multilingual = 2009,
226 PC8DanishNorwegian = 2012,
227 PC862LatinHebrew = 2013,
228 PC8Turkish = 2014,
229 IBMSymbols = 2015,
230 IBMThai = 2016,
231 HPLegal = 2017,
232 HPPiFont = 2018,
233 HPMath8 = 2019,
234 HPPSMath = 2020,
235 HPDesktop = 2021,
236 VenturaMath = 2022,
237 MicrosoftPublishing = 2023,
238 Windows31J = 2024,
239 GB2312 = 2025,
240 Big5 = 2026,
241 Macintosh = 2027,
242 IBM037 = 2028,
243 IBM038 = 2029,
244 IBM273 = 2030,
245 IBM274 = 2031,
246 IBM275 = 2032,
247 IBM277 = 2033,
248 IBM278 = 2034,
249 IBM280 = 2035,
250 IBM281 = 2036,
251 IBM284 = 2037,
252 IBM285 = 2038,
253 IBM290 = 2039,
254 IBM297 = 2040,
255 IBM420 = 2041,
256 IBM423 = 2042,
257 IBM424 = 2043,
// Declared out of numeric order: 2011 appears here, after 2043.
258 PC8CodePage437 = 2011,
259 IBM500 = 2044,
260 IBM851 = 2045,
// Declared out of numeric order: 2010 appears here, after 2045.
261 PCp852 = 2010,
262 IBM855 = 2046,
263 IBM857 = 2047,
264 IBM860 = 2048,
265 IBM861 = 2049,
266 IBM863 = 2050,
267 IBM864 = 2051,
268 IBM865 = 2052,
269 IBM868 = 2053,
270 IBM869 = 2054,
271 IBM870 = 2055,
272 IBM871 = 2056,
273 IBM880 = 2057,
274 IBM891 = 2058,
275 IBM903 = 2059,
276 IBM904 = 2060,
277 IBM905 = 2061,
278 IBM918 = 2062,
279 IBM1026 = 2063,
280 IBMEBCDICATDE = 2064,
281 EBCDICATDEA = 2065,
282 EBCDICCAFR = 2066,
283 EBCDICDKNO = 2067,
284 EBCDICDKNOA = 2068,
285 EBCDICFISE = 2069,
286 EBCDICFISEA = 2070,
287 EBCDICFR = 2071,
288 EBCDICIT = 2072,
289 EBCDICPT = 2073,
290 EBCDICES = 2074,
291 EBCDICESA = 2075,
292 EBCDICESS = 2076,
293 EBCDICUK = 2077,
294 EBCDICUS = 2078,
295 Unknown8BiT = 2079,
296 Mnemonic = 2080,
297 Mnem = 2081,
298 VISCII = 2082,
299 VIQR = 2083,
300 KOI8R = 2084,
301 HZGB2312 = 2085,
302 IBM866 = 2086,
303 PC775Baltic = 2087,
304 KOI8U = 2088,
305 IBM00858 = 2089,
306 IBM00924 = 2090,
307 IBM01140 = 2091,
308 IBM01141 = 2092,
309 IBM01142 = 2093,
310 IBM01143 = 2094,
311 IBM01144 = 2095,
312 IBM01145 = 2096,
313 IBM01146 = 2097,
314 IBM01147 = 2098,
315 IBM01148 = 2099,
316 IBM01149 = 2100,
317 Big5HKSCS = 2101,
318 IBM1047 = 2102,
319 PTCP154 = 2103,
320 Amiga1251 = 2104,
321 KOI7switched = 2105,
322 BRF = 2106,
323 TSCII = 2107,
324 CP51932 = 2108,
325 windows874 = 2109,
326 windows1250 = 2250,
327 windows1251 = 2251,
328 windows1252 = 2252,
329 windows1253 = 2253,
330 windows1254 = 2254,
331 windows1255 = 2255,
332 windows1256 = 2256,
333 windows1257 = 2257,
334 windows1258 = 2258,
335 TIS620 = 2259,
336 CP50220 = 2260
337 };
// Bring the enumerators into class scope, so that for example
// text_encoding::UTF8 names the same value as text_encoding::id::UTF8.
338 using enum id;
339
340 constexpr text_encoding() = default;
341
342 constexpr explicit
343 text_encoding(string_view __enc) noexcept
344 : _M_rep(_S_find_name(__enc))
345 {
346 __enc.copy(_M_name, max_name_length);
347 }
348
349 // @pre i has the value of one of the enumerators of id.
350 constexpr
351 text_encoding(id __i) noexcept
352 : _M_rep(_S_find_id(__i))
353 {
354 if (string_view __name(_M_rep->_M_name); !__name.empty())
355 __name.copy(_M_name, max_name_length);
356 }
357
358 constexpr id mib() const noexcept { return id(_M_rep->_M_id); }
359
360 constexpr const char* name() const noexcept { return _M_name; }
361
362 struct aliases_view : ranges::view_interface<aliases_view>
363 {
364 private:
365 class _Iterator;
366 struct _Sentinel { };
367
368 public:
369 constexpr _Iterator begin() const noexcept;
370 constexpr _Sentinel end() const noexcept { return {}; }
371
372 private:
373 friend struct text_encoding;
374
375 constexpr explicit aliases_view(const _Rep* __r) : _M_begin(__r) { }
376
377 const _Rep* _M_begin = nullptr;
378 };
379
380 constexpr aliases_view
381 aliases() const noexcept
382 {
383 return _M_rep->_M_name[0] ? aliases_view(_M_rep) : aliases_view{nullptr};
384 }
385
386 friend constexpr bool
387 operator==(const text_encoding& __a,
388 const text_encoding& __b) noexcept
389 {
390 if (__a.mib() == id::other && __b.mib() == id::other) [[unlikely]]
391 return _S_comp(__a._M_name, __b._M_name);
392 else
393 return __a.mib() == __b.mib();
394 }
395
396 friend constexpr bool
397 operator==(const text_encoding& __encoding, id __i) noexcept
398 { return __encoding.mib() == __i; }
399
400#if __CHAR_BIT__ == 8
401 static consteval text_encoding
402 literal() noexcept
403 {
404#ifdef __GNUC_EXECUTION_CHARSET_NAME
405 return text_encoding(__GNUC_EXECUTION_CHARSET_NAME);
406#elif defined __clang_literal_encoding__
407 return text_encoding(__clang_literal_encoding__);
408#else
409 return text_encoding();
410#endif
411 }
412
413 static text_encoding
414 environment();
415
416 template<id _Id>
417 static bool
418 environment_is()
419 { return text_encoding(_Id)._M_is_environment(); }
420#else
421 static text_encoding literal() = delete;
422 static text_encoding environment() = delete;
423 template<id> static bool environment_is() = delete;
424#endif
425
426 private:
// Points at this encoding's entry in _S_reps.  A default-constructed
// object refers to the second entry, whose identifier is 2 (id::unknown).
427 const _Rep* _M_rep = _S_reps + 1; // id::unknown
// Stored name.  One element larger than max_name_length and fully
// zero-initialised, so it stays null-terminated when the constructors
// copy at most max_name_length characters into it.
428 char _M_name[max_name_length + 1] = {0};
429
// Used by environment_is(); declared here only, defined elsewhere.
430 bool
431 _M_is_environment() const;
432
// The table of known encodings.
//
// Layout: two nameless entries for identifiers 1 and 2 (id::other and
// id::unknown), then the rows supplied by <bits/text_encoding-data.h>,
// then a sentinel row with identifier 9999 and a null name.
//
// _S_find_id binary-searches this array by identifier and _S_find_name
// walks backwards over rows sharing an identifier, so the included rows
// are expected to be sorted by identifier with each identifier's rows
// adjacent.
433 static inline constexpr _Rep _S_reps[] = {
434 { 1, "" }, { 2, "" },
// The macro asks the data header to emit the rows.  If it is still
// defined after the include, the #error below fires, so the header is
// expected to #undef it once the rows have been emitted.
435#define _GLIBCXX_GET_ENCODING_DATA
436#include <bits/text_encoding-data.h>
437#ifdef _GLIBCXX_GET_ENCODING_DATA
438# error "Invalid text_encoding data"
439#endif
// Never a real encoding: it only marks the end of the table.  Its name
// is null, so it must not be handed to code that reads _M_name.
440 { 9999, nullptr }, // sentinel
441 };
442
443 static constexpr bool
444 _S_comp(string_view __a, string_view __b)
445 { return __unicode::__charset_alias_match(__a, __b); }
446
447 static constexpr const _Rep*
448 _S_find_name(string_view __name) noexcept
449 {
450#ifdef _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET
451 // Optimize the common UTF-8 case to avoid a linear search through all
452 // strings in the table using the _S_comp function.
453 if (__name == "UTF-8")
454 return _S_reps + 2 + _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET;
455#endif
456
457 // The first two array elements (other and unknown) don't have names.
458 // The last element is a sentinel that can never match anything.
459 const auto __first = _S_reps + 2, __end = std::end(_S_reps) - 1;
460 for (auto __r = __first; __r != __end; ++__r)
461 if (_S_comp(__r->_M_name, __name))
462 {
463 // Might have matched an alias. Find the first entry for this ID.
464 const auto __id = __r->_M_id;
465 while (__r[-1]._M_id == __id)
466 --__r;
467 return __r;
468 }
469 return _S_reps; // id::other
470 }
471
472 static constexpr const _Rep*
473 _S_find_id(id __id) noexcept
474 {
475 const auto __i = (_Rep::id)__id;
476 const auto __r = std::lower_bound(_S_reps, std::end(_S_reps) - 1, __i);
477 if (__r->_M_id == __i) [[likely]]
478 return __r;
479 else
480 {
481 // Preconditions: i has the value of one of the enumerators of id.
482 __glibcxx_assert(__r->_M_id == __i);
483 return _S_reps + 1; // id::unknown
484 }
485 }
486 };
487
488 template<>
489 struct hash<text_encoding>
490 {
491 size_t
492 operator()(const text_encoding& __enc) const noexcept
493 { return std::hash<text_encoding::id>()(__enc.mib()); }
494 };
495
496 class text_encoding::aliases_view::_Iterator
497 {
498 public:
499 using value_type = const char*;
500 using reference = const char*;
501 using difference_type = int;
502
503 constexpr _Iterator() = default;
504
505 constexpr value_type
506 operator*() const
507 {
508 if (_M_dereferenceable()) [[likely]]
509 return _M_rep->_M_name;
510 else
511 {
512 __glibcxx_assert(_M_dereferenceable());
513 return "";
514 }
515 }
516
517 constexpr _Iterator&
518 operator++()
519 {
520 if (_M_dereferenceable()) [[likely]]
521 ++_M_rep;
522 else
523 {
524 __glibcxx_assert(_M_dereferenceable());
525 *this = _Iterator{};
526 }
527 return *this;
528 }
529
530 constexpr _Iterator&
531 operator--()
532 {
533 const bool __decrementable
534 = _M_rep != nullptr && _M_rep[-1]._M_id == _M_id;
535 if (__decrementable) [[likely]]
536 --_M_rep;
537 else
538 {
539 __glibcxx_assert(__decrementable);
540 *this = _Iterator{};
541 }
542 return *this;
543 }
544
545 constexpr _Iterator
546 operator++(int)
547 {
548 auto __it = *this;
549 ++*this;
550 return __it;
551 }
552
553 constexpr _Iterator
554 operator--(int)
555 {
556 auto __it = *this;
557 --*this;
558 return __it;
559 }
560
561 constexpr value_type
562 operator[](difference_type __n) const
563 { return *(*this + __n); }
564
565 constexpr _Iterator&
566 operator+=(difference_type __n)
567 {
568 if (_M_rep != nullptr)
569 {
570 if (__n > 0)
571 {
572 if (__n < (std::end(_S_reps) - _M_rep)
573 && _M_rep[__n - 1]._M_id == _M_id) [[likely]]
574 _M_rep += __n;
575 else
576 *this = _Iterator{};
577 }
578 else if (__n < 0)
579 {
580 if (__n > (_S_reps - _M_rep)
581 && _M_rep[__n]._M_id == _M_id) [[likely]]
582 _M_rep += __n;
583 else
584 *this = _Iterator{};
585 }
586 }
587 if (__n != 0)
588 __glibcxx_assert(_M_rep != nullptr);
589 return *this;
590 }
591
592 constexpr _Iterator&
593 operator-=(difference_type __n)
594 {
595 using _Traits = __gnu_cxx::__int_traits<difference_type>;
596 if (__n == _Traits::__min) [[unlikely]]
597 return operator+=(_Traits::__max);
598 return operator+=(-__n);
599 }
600
601 constexpr difference_type
602 operator-(const _Iterator& __i) const
603 {
604 if (_M_id == __i._M_id)
605 return _M_rep - __i._M_rep;
606 __glibcxx_assert(_M_id == __i._M_id);
607 return __gnu_cxx::__int_traits<difference_type>::__max;
608 }
609
610 constexpr bool
611 operator==(const _Iterator&) const = default;
612
613 constexpr bool
614 operator==(_Sentinel) const noexcept
615 { return !_M_dereferenceable(); }
616
617 constexpr strong_ordering
618 operator<=>(const _Iterator& __i) const
619 {
620 __glibcxx_assert(_M_id == __i._M_id);
621 return _M_rep <=> __i._M_rep;
622 }
623
624 friend constexpr _Iterator
625 operator+(_Iterator __i, difference_type __n)
626 {
627 __i += __n;
628 return __i;
629 }
630
631 friend constexpr _Iterator
632 operator+(difference_type __n, _Iterator __i)
633 {
634 __i += __n;
635 return __i;
636 }
637
638 friend constexpr _Iterator
639 operator-(_Iterator __i, difference_type __n)
640 {
641 __i -= __n;
642 return __i;
643 }
644
645 private:
646 friend struct text_encoding;
647
648 constexpr explicit
649 _Iterator(const _Rep* __r) noexcept
650 : _M_rep(__r), _M_id(__r ? __r->_M_id : 0)
651 { }
652
653 constexpr bool
654 _M_dereferenceable() const noexcept
655 { return _M_rep != nullptr && _M_rep->_M_id == _M_id; }
656
657 const _Rep* _M_rep = nullptr;
658 _Rep::id _M_id = 0;
659 };
660
661 constexpr auto
662 text_encoding::aliases_view::begin() const noexcept
663 -> _Iterator
664 { return _Iterator(_M_begin); }
665
666namespace ranges
667{
668 // Opt-in to borrowed_range concept
 // The iterators of aliases_view store only a pointer to a _Rep row and
 // an identifier, not a reference to the view they came from, so they
 // do not depend on the view object staying alive.
669 template<>
670 inline constexpr bool
671 enable_borrowed_range<std::text_encoding::aliases_view> = true;
672}
673
674_GLIBCXX_END_NAMESPACE_VERSION
675} // namespace std
676
677#endif // __cpp_lib_text_encoding
678#endif // _GLIBCXX_TEXT_ENCODING