fork download
  #include <exception>
  #include <iostream>
  #include <locale>
  #include <stdexcept>
  #include <string>
  #include <vector>
  4.  
  /// Decodes a UTF-8 encoded byte string into a sequence of UTF-32 code points.
  ///
  /// The decoder is strict: it rejects bad lead bytes, truncated sequences,
  /// bytes that are not valid continuation bytes, overlong encodings, encoded
  /// surrogates (U+D800..U+DFFF) and values above U+10FFFF.
  ///
  /// @param str  Bytes to decode; an empty string yields an empty result.
  /// @return     One char32_t per decoded code point, in input order.
  /// @throws std::runtime_error if the input is not well-formed UTF-8.
  std::u32string utf8_to_utf32(const std::string& str) {
    std::u32string result;
    result.reserve(str.size());  // Upper bound: at most one code point per byte.

    const std::size_t size = str.size();
    std::size_t i = 0;

    while (i < size) {
      const unsigned char lead = static_cast<unsigned char>(str[i]);

      std::size_t length = 0;    // Total bytes in this sequence, lead included.
      char32_t codepoint = 0;    // Payload bits accumulated so far.
      char32_t min_value = 0;    // Smallest value this length may legally encode.

      if ((lead & 0x80) == 0) {  // 1-byte (ASCII)
        length = 1;
        codepoint = lead;
      } else if ((lead & 0xE0) == 0xC0) {  // 2-byte
        length = 2;
        codepoint = lead & 0x1F;
        min_value = 0x80;
      } else if ((lead & 0xF0) == 0xE0) {  // 3-byte
        length = 3;
        codepoint = lead & 0x0F;
        min_value = 0x800;
      } else if ((lead & 0xF8) == 0xF0) {  // 4-byte
        length = 4;
        codepoint = lead & 0x07;
        min_value = 0x10000;
      } else {
        // Stray continuation byte or a lead byte of a 5/6-byte form.
        throw std::runtime_error("Invalid UTF-8 sequence");
      }

      // Refuse to read past the end of the buffer on a truncated sequence.
      if (length > size - i) {
        throw std::runtime_error("Invalid UTF-8 sequence");
      }

      for (std::size_t k = 1; k < length; ++k) {
        const unsigned char cont = static_cast<unsigned char>(str[i + k]);
        if ((cont & 0xC0) != 0x80) {  // Continuation bytes must be 10xxxxxx.
          throw std::runtime_error("Invalid UTF-8 sequence");
        }
        codepoint = (codepoint << 6) | static_cast<char32_t>(cont & 0x3F);
      }

      // Reject overlong forms, encoded surrogates and out-of-range values.
      const bool overlong = codepoint < min_value;
      const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
      const bool too_large = codepoint > 0x10FFFF;
      if (overlong || surrogate || too_large) {
        throw std::runtime_error("Invalid UTF-8 sequence");
      }

      result.push_back(codepoint);
      i += length;
    }

    return result;
  }
  35.  
  /// Encodes UTF-32 code points as UTF-16 code units stored in a std::wstring.
  ///
  /// Code points up to U+FFFF are emitted as a single unit; larger ones are
  /// emitted as a high/low surrogate pair. Each UTF-16 code unit occupies one
  /// wchar_t element of the result, whatever the width of wchar_t is.
  ///
  /// @param str  Code points to encode; an empty string yields an empty result.
  /// @return     The UTF-16 code units, in input order.
  /// @throws std::runtime_error if a value is a surrogate (U+D800..U+DFFF) or
  ///         exceeds U+10FFFF, since neither can be represented in UTF-16.
  std::wstring utf32_to_utf16(const std::u32string& str) {
    std::wstring result;
    result.reserve(str.size());  // Exact for BMP-only input; grows otherwise.

    for (const char32_t codepoint : str) {
      const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
      if (surrogate || codepoint > 0x10FFFF) {
        throw std::runtime_error("Invalid Unicode code point");
      }

      if (codepoint <= 0xFFFF) {  // BMP (Basic Multilingual Plane)
        result.push_back(static_cast<wchar_t>(codepoint));
      } else {  // Surrogate pair needed
        // Split the 20-bit offset above U+10000 into two 10-bit halves.
        const char32_t offset = codepoint - 0x10000;
        result.push_back(static_cast<wchar_t>((offset >> 10) + 0xD800));
        result.push_back(static_cast<wchar_t>((offset & 0x3FF) + 0xDC00));
      }
    }
    return result;
  }
  51.  
  52. int main() {
  53. std::string utf8_text = "A – B";
  54. std::u32string unicode_text = utf8_to_utf32(utf8_text); // UTF-8 → UTF-32
  55. std::wstring wide_text = utf32_to_utf16(unicode_text); // UTF-32 → UTF-16
  56.  
  57. std::wcout << L"UTF-16 wstring: " << wide_text << std::endl;
  58. return 0;
  59. }
Success #stdin #stdout 0.01s 5292KB
stdin
Standard input is empty
stdout
UTF-16 wstring: A - B