dgl.dml.lexer source code

1 /*
2 Copyright (c) 2015 Timur Gafarov 
3 
4 Boost Software License - Version 1.0 - August 17th, 2003
5 
6 Permission is hereby granted, free of charge, to any person or organization
7 obtaining a copy of the software and accompanying documentation covered by
8 this license (the "Software") to use, reproduce, display, distribute,
9 execute, and transmit the Software, and to prepare derivative works of the
10 Software, and to permit third-parties to whom the Software is furnished to
11 do so, all subject to the following:
12 
13 The copyright notices in the Software and this entire statement, including
14 the above license grant, this restriction and the following disclaimer,
15 must be included in all copies of the Software, in whole or in part, and
16 all derivative works of the Software, unless such copies or derivative
17 works are solely in the form of machine-executable object code generated by
18 a source language processor.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
23 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
24 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
25 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.
27 */
28 
29 module dgl.dml.lexer;
30 
31 import std.stdio;
32 import std.algorithm;
33 import std.ascii;
34 import dlib.container.array;
35 import dgl.dml.utf8;
36 
37 struct Lexeme
38 {
39     bool valid = true;
40     DynamicArray!(dchar, 32) str;
41 
42     void free()
43     {
44         str.free();
45     }
46 }
47 
48 enum
49 {
50     FULL_MATCH,
51     PARTIAL_MATCH,
52     CANDIDATE_MATCH,
53     NO_MATCH
54 }
55 
56 Lexeme emptyLexeme()
57 {
58     Lexeme lexeme;
59     return lexeme;
60 }
61 
62 Lexeme invalidLexeme()
63 {
64     Lexeme lexeme;
65     lexeme.valid = false;
66     return lexeme;
67 }
68 
69 //version = LexerDebug;
70 
71 /*
72  * GC-free, Unicode-aware lexical analyzer.
73  * Assumes ASCII or UTF-8 input. Outputs arrays of dchars as lexemes.
74  * Reads character stream one-by-one.
75  * Treats LF (\n) as a newline character on all systems
76  * (compatible with Windows and all Unices).
77  * Implemented as a chain of filters
78  * (stream -> basic lexemes -> with strings -> filtered comments).
79  */
80 
81 struct BaseFilter
82 {
83     UTF8Decoder dec;
84     string[] delimiters;
85     uint line = 1;
86 
87     this(string str)
88     {
89         this.dec = UTF8Decoder(str);
90     }
91 
92     Lexeme lexeme1;
93     Lexeme lexeme2;
94     bool nextIsMatch = false;
95     int initNextLexeme2 = -1;
96     Lexeme get()
97     {
98         if (lexeme1.str.data.length)
99         {
100             auto res = lexeme1;
101             lexeme1 = emptyLexeme();
102             return res;
103         }
104 
105         int c = 0;
106         while (c != UTF8_END && c != UTF8_ERROR)
107         {
108             if (initNextLexeme2 >= 0)
109             {
110                 c = initNextLexeme2;
111                 initNextLexeme2 = -1;
112             }
113             else
114             {
115                 c = dec.decodeNext();
116                 if (cast(dchar)c == '\n')
117                     line++;
118             }
119 
120             if (c < 0) break;
121 
122             lexeme2.str.append(cast(dchar)c);
123 
124             uint m = match(lexeme2);
125             if (m == FULL_MATCH)
126             {
127                 nextIsMatch = false;
128                 version(LexerDebug) writefln("FULL_MATCH for %s", lexeme2.str.data);
129                 
130                 if (lexeme1.str.data.length)
131                 {
132                     auto res = lexeme1;
133                     lexeme1 = lexeme2;
134                     lexeme2 = emptyLexeme();
135                     return res;
136                 }
137                 else
138                 {
139                     auto res = lexeme2;
140                     lexeme2 = emptyLexeme();
141                     return res;
142                 }
143             }
144             else if (m == PARTIAL_MATCH)
145             {
146                 version(LexerDebug) writefln("PARTIAL_MATCH for %s", lexeme2.str.data);
147             }
148             else if (m == CANDIDATE_MATCH)
149             {
150                 version(LexerDebug) writefln("CANDIDATE_MATCH for %s", lexeme2.str.data);
151                 nextIsMatch = true;
152             }
153             else if (m == NO_MATCH)
154             {
155                 version(LexerDebug) writefln("NO_MATCH for %s", lexeme2.str.data);
156 
157                 if (nextIsMatch)
158                 {
159                     version(LexerDebug) writeln("case 0");
160                     nextIsMatch = false;
161                     auto last = lexeme2.str.data[$-1];
162                     lexeme2.str.remove(1);
163 
164                     if (lexeme1.str.data.length)
165                     {
166                         version(LexerDebug) writeln("case 1");
167                         auto res = lexeme1;
168                         lexeme1 = lexeme2;
169                         lexeme2 = emptyLexeme();
170                         initNextLexeme2 = last;
171                         return res;
172                     }
173                     else
174                     {
175                         version(LexerDebug) writeln("case 2");
176                         auto res = lexeme2;
177                         lexeme2 = emptyLexeme();
178                         initNextLexeme2 = last;
179                         return res;
180                     }
181                 }
182                 else
183                 {
184                     version(LexerDebug) writeln("case 3");
185                     lexeme1.str.append(lexeme2.str.data);
186                     lexeme2.str.free();
187                 }
188             }
189         }
190 
191         auto res = lexeme1;
192         if (lexeme2.str.data.length)
193         {
194             if (!res.str.data.length)
195             {
196                 res = lexeme2;
197             }
198             else
199             {
200                 lexeme1 = lexeme2;
201             }
202             lexeme2 = emptyLexeme();
203         }
204         else
205         {
206             lexeme1 = emptyLexeme();
207         }
208 
209         if (!res.str.data.length)
210             res.valid = false;
211 
212         return res;
213     }
214 
215     uint match(Lexeme lexeme)
216     {
217         bool partialMatch = false;
218         foreach(d; delimiters)
219         {
220             auto s = lexeme.str.data;
221             size_t delimLen = 0;
222             size_t matchedLen = 0;
223             bool counting = true;
224             foreach(i, c; d)
225             {
226                 delimLen++;
227                 if (i < s.length)
228                 {
229                     if (s[i] == c)
230                     {
231                         if (counting)
232                             matchedLen++;
233                     }
234                     else
235                         counting = false;
236                 }
237             }
238             if (matchedLen == delimLen && matchedLen == s.length)
239             {
240                 if (!partialMatch)
241                     return FULL_MATCH;
242                 else
243                     return CANDIDATE_MATCH;
244             }
245             else if (matchedLen == s.length)
246                 partialMatch = true;
247         }
248         
249         if (isWhite(lexeme.str.data[0]))
250         {
251             return FULL_MATCH;
252         }
253 
254         if (partialMatch)
255             return PARTIAL_MATCH;
256         else
257             return NO_MATCH;
258     }
259 }
260 
261 static string[] stddelimiters = 
262 [
263     "==","!=","<=",">=","+=","-=","*=","/=",
264     "++","--","||","&&","<<",">>","<>",
265     "//","/*","*/","\\\\","\\\"","\\\'",
266     "+","-","*","/","%","=","|","^","~","<",">","!",
267     "(",")","{","}","[","]",
268     ";",":",",","@","#","$","&",
269     "\\","\"","\'"
270 ];
271 
272 struct StringFilter
273 {
274     BaseFilter lexer;
275     bool readingString = false;
276     Lexeme strLexeme;
277     dchar currentStrChar = 0;
278     
279     this(string str)
280     {
281         lexer = BaseFilter(str);
282         lexer.delimiters = stddelimiters;
283         sort!("a.length > b.length")(lexer.delimiters);
284     }
285     
286     Lexeme get()
287     {
288         Lexeme lexeme;
289         do
290         {
291             lexeme = lexer.get();
292             if (lexeme.valid)
293             {
294                 if (readingString)
295                 {
296                     strLexeme.str.append(lexeme.str.data);
297                 }
298                     
299                 if (isStringChar(lexeme))
300                 {
301                     if (readingString)
302                     {
303                         readingString = false;
304                         auto res = strLexeme;
305                         strLexeme = emptyLexeme();
306                         currentStrChar = 0;
307                         return res;
308                     }
309                     else
310                     {
311                         currentStrChar = lexeme.str.data[0];
312                         readingString = true;
313                         strLexeme.str.append(lexeme.str.data);
314                         continue;
315                     }
316                 }
317                 else
318                 {
319                     if (isWhite(lexeme.str.data[0]))
320                     {
321                         if (lexeme.str.data[0] == '\n')
322                             return lexeme;
323                         else
324                             continue;
325                     }
326                     else
327                     {
328                         if (readingString)
329                             continue;
330                         else
331                             return lexeme;
332                     }
333                 }
334             }
335             else
336                 return lexeme;
337         }
338         while(lexeme.valid);
339         return lexeme;
340     }
341     
342     // TODO: support multi-char string marks
343     bool isStringChar(Lexeme lexeme)
344     {
345         if (currentStrChar == 0)
346             return 
347                 lexeme.str.data[0] == '"' ||
348                 lexeme.str.data[0] == '\'';
349         else
350             return lexeme.str.data[0] == currentStrChar;
351     }
352 
353     uint line()
354     {
355         return lexer.line;
356     }
357 }
358 
359 // TODO: make this configurable
360 struct CommentFilter
361 {
362     StringFilter lexer;
363     bool readingComment = false;
364     
365     this(string str)
366     {
367         lexer = StringFilter(str);
368     }
369     
370     Lexeme get()
371     {
372         Lexeme lexeme;
373         do
374         {
375             lexeme = lexer.get();
376             if (lexeme.valid)
377             {
378                 if (lexeme.str.data[0] == '#')
379                 {
380                     readingComment = true;
381                     continue;
382                 }
383                 else
384                 {
385                     if (lexeme.str.data[0] == '\n')
386                     {
387                         if (readingComment)
388                             readingComment = false;
389                         continue;
390                     }
391                     else
392                     {
393                         if (readingComment)
394                             continue;
395                         else
396                             return lexeme;
397                     }
398                 }
399             }
400             else
401                 return lexeme;
402         }
403         while(lexeme.valid);
404         return lexeme;
405     }
406 
407     uint line()
408     {
409         return lexer.line;
410     }
411 }
412 
413 struct Lexer
414 {
415     CommentFilter lexer;
416     Lexeme current;
417 
418     this(string str)
419     {
420         lexer = CommentFilter(str);
421     }
422 
423     Lexeme get()
424     {
425         current = lexer.get();
426         return current;
427     }
428 
429     uint line()
430     {
431         return lexer.line;
432     }
433 }
434