/*
Copyright (c) 2015 Timur Gafarov

Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

module dgl.dml.utf8;

/*
 * Simple and pretty fast UTF-8 decoder
 */

/// Returned by the decoder when the whole input has been consumed.
enum UTF8_END = -1;

/// Returned by the decoder when a malformed byte sequence is encountered.
enum UTF8_ERROR = -2;

/**
 * Incremental UTF-8 decoder over a string.
 *
 * Each call to decodeNext() consumes one encoded character from `input`
 * and returns its code point, or one of the negative sentinels
 * UTF8_END / UTF8_ERROR.
 */
struct UTF8Decoder
{
    /// Offset in `input` of the next byte to read.
    size_t index = 0;

    /// Number of decode attempts made so far; incremented by decodeNext()
    /// before it reads the lead byte, so failed attempts are counted too.
    int character = 0;

    /// The byte sequence being decoded.
    string input;

    /**
     * Reads the next raw byte and advances `index`.
     *
     * Returns: the byte value (0 to 255), or UTF8_END when `index` is at
     * or past the end of `input`.
     */
    int get()
    {
        if (index >= input.length)
            return UTF8_END;
        auto c = input[index] & 0xFF;
        index++;
        return c;
    }

    /**
     * Reads the next byte and interprets it as a continuation byte (10xxxxxx).
     *
     * Returns: the six payload bits, or UTF8_ERROR when the byte is not a
     * continuation byte or the input is exhausted. Note that the byte is
     * consumed by get() in either case.
     */
    int cont()
    {
        int c = get();
        return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR;
    }

    /// Creates a decoder that reads from `str`, starting at its first byte.
    this(string str)
    {
        input = str;
    }

    /**
     * Decodes the next character from the input.
     *
     * Returns: the code point (>= 0), UTF8_END when `index` equals the input
     * length, or UTF8_ERROR for a malformed, overlong, surrogate or
     * out-of-range sequence (and when `index` is past the end of the input).
     *
     * Bytes already consumed while reading a malformed sequence are not
     * pushed back; the next call continues after them.
     */
    int decodeNext()
    {
        int c; // the first byte of the character
        int r; // the result

        if (index >= input.length)
            return index == input.length ? UTF8_END : UTF8_ERROR;

        character++;
        c = get();

        // Zero continuation (0 to 127)
        if ((c & 0x80) == 0)
            return c;

        // One continuation (128 to 2047)
        if ((c & 0xE0) == 0xC0)
        {
            int c1 = cont();
            if (c1 >= 0)
            {
                r = ((c & 0x1F) << 6) | c1;
                // Values below 128 here are overlong encodings.
                return r >= 128 ? r : UTF8_ERROR;
            }
        }
        // Two continuation (2048 to 55295 and 57344 to 65535)
        else if ((c & 0xF0) == 0xE0)
        {
            int c1 = cont();
            int c2 = cont();
            // Either sentinel is negative, so the OR is negative on any failure.
            if ((c1 | c2) >= 0)
            {
                r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
                // Reject overlong encodings and the surrogate range.
                return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR;
            }
        }
        // Three continuation (65536 to 1114111)
        else if ((c & 0xF8) == 0xF0)
        {
            int c1 = cont();
            int c2 = cont();
            int c3 = cont();
            if ((c1 | c2 | c3) >= 0)
            {
                // The lead byte carries three payload bits. The assembled
                // value already IS the code point, so no offset is added.
                r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
                // Reject overlong encodings and values above U+10FFFF.
                return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR;
            }
        }

        return UTF8_ERROR;
    }
}