1 /* 2 Copyright (c) 2015 Timur Gafarov 3 4 Boost Software License - Version 1.0 - August 17th, 2003 5 6 Permission is hereby granted, free of charge, to any person or organization 7 obtaining a copy of the software and accompanying documentation covered by 8 this license (the "Software") to use, reproduce, display, distribute, 9 execute, and transmit the Software, and to prepare derivative works of the 10 Software, and to permit third-parties to whom the Software is furnished to 11 do so, all subject to the following: 12 13 The copyright notices in the Software and this entire statement, including 14 the above license grant, this restriction and the following disclaimer, 15 must be included in all copies of the Software, in whole or in part, and 16 all derivative works of the Software, unless such copies or derivative 17 works are solely in the form of machine-executable object code generated by 18 a source language processor. 19 20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 23 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 24 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 25 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 DEALINGS IN THE SOFTWARE. 27 */ 28 29 module dgl.dml.lexer; 30 31 import std.stdio; 32 import std.algorithm; 33 import std.ascii; 34 import dlib.container.array; 35 import dgl.dml.utf8; 36 37 struct Lexeme 38 { 39 bool valid = true; 40 DynamicArray!(dchar, 32) str; 41 42 void free() 43 { 44 str.free(); 45 } 46 } 47 48 enum 49 { 50 FULL_MATCH, 51 PARTIAL_MATCH, 52 CANDIDATE_MATCH, 53 NO_MATCH 54 } 55 56 Lexeme emptyLexeme() 57 { 58 Lexeme lexeme; 59 return lexeme; 60 } 61 62 Lexeme invalidLexeme() 63 { 64 Lexeme lexeme; 65 lexeme.valid = false; 66 return lexeme; 67 } 68 69 //version = LexerDebug; 70 71 /* 72 * GC-free, Unicode-aware lexical analyzer. 73 * Assumes ASCII or UTF-8 input. Outputs arrays of dchars as lexemes. 74 * Reads character stream one-by-one. 75 * Treats LF (\n) as a newline character on all systems 76 * (compatible with Windows and all Unices). 77 * Implemented as a chain of filters 78 * (stream -> basic lexemes -> with strings -> filtered comments). 79 */ 80 81 struct BaseFilter 82 { 83 UTF8Decoder dec; 84 string[] delimiters; 85 uint line = 1; 86 87 this(string str) 88 { 89 this.dec = UTF8Decoder(str); 90 } 91 92 Lexeme lexeme1; 93 Lexeme lexeme2; 94 bool nextIsMatch = false; 95 int initNextLexeme2 = -1; 96 Lexeme get() 97 { 98 if (lexeme1.str.data.length) 99 { 100 auto res = lexeme1; 101 lexeme1 = emptyLexeme(); 102 return res; 103 } 104 105 int c = 0; 106 while (c != UTF8_END && c != UTF8_ERROR) 107 { 108 if (initNextLexeme2 >= 0) 109 { 110 c = initNextLexeme2; 111 initNextLexeme2 = -1; 112 } 113 else 114 { 115 c = dec.decodeNext(); 116 if (cast(dchar)c == '\n') 117 line++; 118 } 119 120 if (c < 0) break; 121 122 lexeme2.str.append(cast(dchar)c); 123 124 uint m = match(lexeme2); 125 if (m == FULL_MATCH) 126 { 127 nextIsMatch = false; 128 version(LexerDebug) writefln("FULL_MATCH for %s", lexeme2.str.data); 129 130 if (lexeme1.str.data.length) 131 { 132 auto res = lexeme1; 133 lexeme1 = lexeme2; 134 lexeme2 = emptyLexeme(); 135 return res; 136 } 137 else 138 { 139 auto res = lexeme2; 140 lexeme2 = emptyLexeme(); 141 return res; 142 } 143 } 144 else if (m == PARTIAL_MATCH) 145 { 146 version(LexerDebug) writefln("PARTIAL_MATCH for %s", lexeme2.str.data); 147 } 148 else if (m == CANDIDATE_MATCH) 149 { 150 version(LexerDebug) writefln("CANDIDATE_MATCH for %s", lexeme2.str.data); 151 nextIsMatch = true; 152 } 153 else if (m == NO_MATCH) 154 { 155 version(LexerDebug) writefln("NO_MATCH for %s", lexeme2.str.data); 156 157 if (nextIsMatch) 158 { 159 version(LexerDebug) writeln("case 0"); 160 nextIsMatch = false; 161 auto last = lexeme2.str.data[$-1]; 162 lexeme2.str.remove(1); 163 164 if (lexeme1.str.data.length) 165 { 166 version(LexerDebug) writeln("case 1"); 167 auto res = lexeme1; 168 lexeme1 = lexeme2; 169 lexeme2 = emptyLexeme(); 170 initNextLexeme2 = last; 171 return res; 172 } 173 else 174 { 175 version(LexerDebug) writeln("case 2"); 176 auto res = lexeme2; 177 lexeme2 = emptyLexeme(); 178 initNextLexeme2 = last; 179 return res; 180 } 181 } 182 else 183 { 184 version(LexerDebug) writeln("case 3"); 185 lexeme1.str.append(lexeme2.str.data); 186 lexeme2.str.free(); 187 } 188 } 189 } 190 191 auto res = lexeme1; 192 if (lexeme2.str.data.length) 193 { 194 if (!res.str.data.length) 195 { 196 res = lexeme2; 197 } 198 else 199 { 200 lexeme1 = lexeme2; 201 } 202 lexeme2 = emptyLexeme(); 203 } 204 else 205 { 206 lexeme1 = emptyLexeme(); 207 } 208 209 if (!res.str.data.length) 210 res.valid = false; 211 212 return res; 213 } 214 215 uint match(Lexeme lexeme) 216 { 217 bool partialMatch = false; 218 foreach(d; delimiters) 219 { 220 auto s = lexeme.str.data; 221 size_t delimLen = 0; 222 size_t matchedLen = 0; 223 bool counting = true; 224 foreach(i, c; d) 225 { 226 delimLen++; 227 if (i < s.length) 228 { 229 if (s[i] == c) 230 { 231 if (counting) 232 matchedLen++; 233 } 234 else 235 counting = false; 236 } 237 } 238 if (matchedLen == delimLen && matchedLen == s.length) 239 { 240 if (!partialMatch) 241 return FULL_MATCH; 242 else 243 return CANDIDATE_MATCH; 244 } 245 else if (matchedLen == s.length) 246 partialMatch = true; 247 } 248 249 if (isWhite(lexeme.str.data[0])) 250 { 251 return FULL_MATCH; 252 } 253 254 if (partialMatch) 255 return PARTIAL_MATCH; 256 else 257 return NO_MATCH; 258 } 259 } 260 261 static string[] stddelimiters = 262 [ 263 "==","!=","<=",">=","+=","-=","*=","/=", 264 "++","--","||","&&","<<",">>","<>", 265 "//","/*","*/","\\\\","\\\"","\\\'", 266 "+","-","*","/","%","=","|","^","~","<",">","!", 267 "(",")","{","}","[","]", 268 ";",":",",","@","#","$","&", 269 "\\","\"","\'" 270 ]; 271 272 struct StringFilter 273 { 274 BaseFilter lexer; 275 bool readingString = false; 276 Lexeme strLexeme; 277 dchar currentStrChar = 0; 278 279 this(string str) 280 { 281 lexer = BaseFilter(str); 282 lexer.delimiters = stddelimiters; 283 sort!("a.length > b.length")(lexer.delimiters); 284 } 285 286 Lexeme get() 287 { 288 Lexeme lexeme; 289 do 290 { 291 lexeme = lexer.get(); 292 if (lexeme.valid) 293 { 294 if (readingString) 295 { 296 strLexeme.str.append(lexeme.str.data); 297 } 298 299 if (isStringChar(lexeme)) 300 { 301 if (readingString) 302 { 303 readingString = false; 304 auto res = strLexeme; 305 strLexeme = emptyLexeme(); 306 currentStrChar = 0; 307 return res; 308 } 309 else 310 { 311 currentStrChar = lexeme.str.data[0]; 312 readingString = true; 313 strLexeme.str.append(lexeme.str.data); 314 continue; 315 } 316 } 317 else 318 { 319 if (isWhite(lexeme.str.data[0])) 320 { 321 if (lexeme.str.data[0] == '\n') 322 return lexeme; 323 else 324 continue; 325 } 326 else 327 { 328 if (readingString) 329 continue; 330 else 331 return lexeme; 332 } 333 } 334 } 335 else 336 return lexeme; 337 } 338 while(lexeme.valid); 339 return lexeme; 340 } 341 342 // TODO: support multi-char string marks 343 bool isStringChar(Lexeme lexeme) 344 { 345 if (currentStrChar == 0) 346 return 347 lexeme.str.data[0] == '"' || 348 lexeme.str.data[0] == '\''; 349 else 350 return lexeme.str.data[0] == currentStrChar; 351 } 352 353 uint line() 354 { 355 return lexer.line; 356 } 357 } 358 359 // TODO: make this configurable 360 struct CommentFilter 361 { 362 StringFilter lexer; 363 bool readingComment = false; 364 365 this(string str) 366 { 367 lexer = StringFilter(str); 368 } 369 370 Lexeme get() 371 { 372 Lexeme lexeme; 373 do 374 { 375 lexeme = lexer.get(); 376 if (lexeme.valid) 377 { 378 if (lexeme.str.data[0] == '#') 379 { 380 readingComment = true; 381 continue; 382 } 383 else 384 { 385 if (lexeme.str.data[0] == '\n') 386 { 387 if (readingComment) 388 readingComment = false; 389 continue; 390 } 391 else 392 { 393 if (readingComment) 394 continue; 395 else 396 return lexeme; 397 } 398 } 399 } 400 else 401 return lexeme; 402 } 403 while(lexeme.valid); 404 return lexeme; 405 } 406 407 uint line() 408 { 409 return lexer.line; 410 } 411 } 412 413 struct Lexer 414 { 415 CommentFilter lexer; 416 Lexeme current; 417 418 this(string str) 419 { 420 lexer = CommentFilter(str); 421 } 422 423 Lexeme get() 424 { 425 current = lexer.get(); 426 return current; 427 } 428 429 uint line() 430 { 431 return lexer.line; 432 } 433 } 434