1 /*
2 Copyright (c) 2015 Timur Gafarov 
3 
4 Boost Software License - Version 1.0 - August 17th, 2003
5 
6 Permission is hereby granted, free of charge, to any person or organization
7 obtaining a copy of the software and accompanying documentation covered by
8 this license (the "Software") to use, reproduce, display, distribute,
9 execute, and transmit the Software, and to prepare derivative works of the
10 Software, and to permit third-parties to whom the Software is furnished to
11 do so, all subject to the following:
12 
13 The copyright notices in the Software and this entire statement, including
14 the above license grant, this restriction and the following disclaimer,
15 must be included in all copies of the Software, in whole or in part, and
16 all derivative works of the Software, unless such copies or derivative
17 works are solely in the form of machine-executable object code generated by
18 a source language processor.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
23 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
24 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
25 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.
27 */
28 
29 module dgl.dml.utf8;
30 
31 /*
32  * Simple and pretty fast UTF-8 decoder
33  */
34 
// Sentinel results of the decoder. Both are negative so they can never be
// confused with a decoded code point or a raw byte value.
enum : int
{
    UTF8_END   = -1, // no more input to read
    UTF8_ERROR = -2  // malformed or otherwise invalid sequence
}
37 
/**
 * Incremental UTF-8 decoder over a string.
 *
 * Call decodeNext() repeatedly: every call yields the next code point,
 * UTF8_END once the whole input has been consumed, or UTF8_ERROR when the
 * bytes at the current position do not form a valid UTF-8 sequence.
 */
struct UTF8Decoder
{
    /// Byte offset into `input` of the next byte to be read.
    size_t index = 0;

    /// Number of decodeNext() calls that read at least one byte
    /// (failed decodes are counted too).
    int character = 0;

    /// The UTF-8 encoded text being decoded.
    string input;

    /**
     * Reads a single byte and advances `index` past it.
     *
     * Returns: the byte value (0 to 255), or UTF8_END if `index` is at or
     * beyond the end of `input` (in which case `index` is not changed).
     */
    int get()
    {
        if (index >= input.length)
            return UTF8_END;
        auto c = input[index] & 0xFF;
        index++;
        return c;
    }

    /**
     * Reads the next byte as a continuation byte (bit pattern 10xxxxxx).
     *
     * Returns: the 6 payload bits (0 to 63), or UTF8_ERROR if the byte is
     * not a continuation byte or the input has ended. A byte that fails the
     * check has still been consumed.
     */
    int cont()
    {
        // get() may return UTF8_END (-1); -1 & 0xC0 is 0xC0, so the end of
        // input is reported as UTF8_ERROR here as well.
        int c = get();
        return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR;
    }

    /// Creates a decoder positioned at the first byte of `str`.
    this(string str)
    {
        input = str;
    }

    /**
     * Decodes the code point starting at `index`.
     *
     * Returns:
     *   the decoded code point (0 to 0x10FFFF, surrogates excluded);
     *   UTF8_END if `index` is exactly at the end of the input;
     *   UTF8_ERROR if `index` lies beyond the end of the input, or the
     *   sequence is truncated, has a bad lead or continuation byte, is an
     *   overlong encoding, encodes a surrogate (0xD800 to 0xDFFF), or
     *   encodes a value above 0x10FFFF.
     *
     * After an error `index` has moved past every byte that was examined,
     * so decoding can be resumed by calling decodeNext() again.
     */
    int decodeNext()
    {
        int c;  // the first byte of the character
        int r;  // the result

        if (index >= input.length)
            return index == input.length ? UTF8_END : UTF8_ERROR;

        character++;
        c = get();

        // Zero continuations (0 to 127)
        if ((c & 0x80) == 0)
            return c;

        // One continuation (128 to 2047)
        if ((c & 0xE0) == 0xC0)
        {
            int c1 = cont();
            if (c1 >= 0)
            {
                r = ((c & 0x1F) << 6) | c1;
                // Values below 128 are overlong encodings.
                return r >= 0x80 ? r : UTF8_ERROR;
            }
        }
        // Two continuations (2048 to 55295 and 57344 to 65535)
        else if ((c & 0xF0) == 0xE0)
        {
            int c1 = cont();
            int c2 = cont();
            // cont() yields 0..63 or a negative error, so the OR of the
            // results is negative as soon as any single one failed.
            if ((c1 | c2) >= 0)
            {
                r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
                // Reject overlong encodings and UTF-16 surrogates.
                return r >= 0x800 && (r < 0xD800 || r > 0xDFFF) ? r : UTF8_ERROR;
            }
        }
        // Three continuations (65536 to 1114111)
        else if ((c & 0xF8) == 0xF0)
        {
            int c1 = cont();
            int c2 = cont();
            int c3 = cont();
            if ((c1 | c2 | c3) >= 0)
            {
                // A four-byte lead byte (11110xxx) carries only 3 payload
                // bits, and the assembled value already is the code point:
                // no offset must be added to it.
                r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
                // Reject overlong encodings and values beyond U+10FFFF.
                return r >= 0x10000 && r <= 0x10FFFF ? r : UTF8_ERROR;
            }
        }

        // Stray continuation byte, invalid lead byte (0xF8 to 0xFF), or a
        // bad or missing continuation byte in one of the branches above.
        return UTF8_ERROR;
    }
}
115