/*
 * Hunt - A refined core library for D programming language.
 *
 * Copyright (C) 2018-2019 HuntLabs
 *
 * Website: https://www.huntlabs.net/
 *
 * Licensed under the Apache-2.0 License.
 *
 */

module hunt.text.QuotedStringTokenizer;

import std.conv;
import std.ascii;
import std.string;

import hunt.collection.StringBuffer;
import hunt.text.StringTokenizer;
import hunt.text.Common;
import hunt.text.StringBuilder;
import hunt.Exceptions;
import hunt.util.Common;
import hunt.util.ConverterUtils;


/**
 * StringTokenizer with Quoting support.
 *
 * This class is a copy of the java.util.StringTokenizer API and the behaviour
 * is the same, except that single and double quoted string values are
 * recognised. Delimiters within quotes are not considered delimiters. Quotes
 * can be escaped with '\'.
 *
 * Besides tokenizing, the class offers static helpers to quote and unquote
 * strings.
 *
 * @see java.util.StringTokenizer
 *
 */
class QuotedStringTokenizer : StringTokenizer {
    /// Delimiters used when the caller passes a null delimiter string.
    private enum string __delim = "\t\n\r";

    /// Marker stored in `escapes` for control characters that have no short escape.
    private enum char NO_ESCAPE = cast(char) 0xFF;

    private string _string;                 // text being tokenized
    private string _delim = __delim;        // current delimiter set
    private bool _returnQuotes = false;     // keep quote (and escape) characters in tokens
    private bool _returnDelimiters = false; // report delimiters as tokens of their own
    private StringBuffer _token;            // accumulates the token currently being scanned
    private bool _hasToken = false;         // true when _token holds a token not yet handed out
    private int _i = 0;                     // index of the next character to scan
    private int _lastStart = 0;             // value of _i when the latest scan started
    private bool _double = true;            // treat '"' as a quote character
    private bool _single = true;            // treat '\'' as a quote character

    /**
     * Create a tokenizer.
     *
     * @param str
     *            the text to tokenize
     * @param delim
     *            the delimiter characters; null selects the default "\t\n\r"
     * @param returnDelimiters
     *            if true, each delimiter is returned as a token
     * @param returnQuotes
     *            if true, quote and escape characters are kept in the tokens
     * @throws Error
     *             if a quote character is used as delimiter
     */
    this(string str, string delim, bool returnDelimiters, bool returnQuotes) {
        super("");
        _string = str;
        if (delim !is null)
            _delim = delim;
        _returnDelimiters = returnDelimiters;
        _returnQuotes = returnQuotes;

        if (_delim.indexOf('\'') >= 0 || _delim.indexOf('"') >= 0)
            throw new Error("Can't use quotes as delimiters: " ~ _delim);

        // Initial capacity: half of the input, but 512 for inputs above 1024 characters.
        _token = new StringBuffer(_string.length > 1024 ? 512 : _string.length / 2);
    }

    /// Same as the four-argument constructor with `returnQuotes` = false.
    this(string str, string delim, bool returnDelimiters) {
        this(str, delim, returnDelimiters, false);
    }

    /// Tokenize `str` on `delim`, returning neither delimiters nor quotes.
    this(string str, string delim) {
        this(str, delim, false, false);
    }

    /// Tokenize `str` on the default delimiters "\t\n\r".
    this(string str) {
        this(str, null, false, false);
    }

    /**
     * Scan forward for the next token and buffer it.
     *
     * The scan is a small state machine:
     * 0 = between tokens, 1 = inside an unquoted token,
     * 2 = inside single quotes, 3 = inside double quotes.
     *
     * @return true if a token is available for nextToken()
     */
    override
    bool hasMoreTokens() {
        // Already found a token
        if (_hasToken)
            return true;

        // Remembered so that nextToken(delim) can rescan from here.
        _lastStart = _i;

        int state = 0;
        bool escape = false;
        while (_i < _string.length) {
            char c = _string.charAt(_i++);

            switch (state) {
            case 0: // Start
                if (_delim.indexOf(c) >= 0) {
                    if (_returnDelimiters) {
                        _token.append(c);
                        return _hasToken = true;
                    }
                    // otherwise leading delimiters are skipped
                } else if (c == '\'' && _single) {
                    if (_returnQuotes)
                        _token.append(c);
                    state = 2;
                } else if (c == '\"' && _double) {
                    if (_returnQuotes)
                        _token.append(c);
                    state = 3;
                } else {
                    _token.append(c);
                    _hasToken = true;
                    state = 1;
                }
                break;

            case 1: // Token
                _hasToken = true;
                if (_delim.indexOf(c) >= 0) {
                    // Push the delimiter back so the next scan can return it.
                    if (_returnDelimiters)
                        _i--;
                    return _hasToken;
                } else if (c == '\'' && _single) {
                    if (_returnQuotes)
                        _token.append(c);
                    state = 2;
                } else if (c == '\"' && _double) {
                    if (_returnQuotes)
                        _token.append(c);
                    state = 3;
                } else {
                    _token.append(c);
                }
                break;

            case 2: // Single Quote
                _hasToken = true;
                if (escape) {
                    // Character after a backslash is taken literally.
                    escape = false;
                    _token.append(c);
                } else if (c == '\'') {
                    if (_returnQuotes)
                        _token.append(c);
                    state = 1;
                } else if (c == '\\') {
                    if (_returnQuotes)
                        _token.append(c);
                    escape = true;
                } else {
                    _token.append(c);
                }
                break;

            case 3: // Double Quote
                _hasToken = true;
                if (escape) {
                    // Character after a backslash is taken literally.
                    escape = false;
                    _token.append(c);
                } else if (c == '\"') {
                    if (_returnQuotes)
                        _token.append(c);
                    state = 1;
                } else if (c == '\\') {
                    if (_returnQuotes)
                        _token.append(c);
                    escape = true;
                } else {
                    _token.append(c);
                }
                break;

            default:
                break;
            }
        }

        return _hasToken;
    }

    /**
     * Return the next token and clear the internal buffer.
     *
     * @return the next token
     * @throws NoSuchElementException
     *             if no token is left
     */
    override
    string nextToken() {
        if (!hasMoreTokens() || _token is null)
            throw new NoSuchElementException("");
        string t = _token.toString();
        _token.setLength(0);
        _hasToken = false;
        return t;
    }

    /**
     * Switch to a new delimiter set and return the next token.
     *
     * Any token buffered by a previous hasMoreTokens() call is discarded and
     * the scan restarts at the position where that call began.
     *
     * @param delim
     *            the new delimiter characters
     * @return the next token
     */
    override
    string nextToken(string delim) {
        _delim = delim;
        _i = _lastStart;
        _token.setLength(0);
        _hasToken = false;
        return nextToken();
    }


    /**
     * Not implemented.
     *
     * @return always -1
     */
    override
    int countTokens() {
        return -1;
    }

    /**
     * Quote a string. The string is quoted only if quoting is required due to
     * embedded delimiters, quote characters, backslashes, white space or the
     * empty string.
     *
     * @param s
     *            The string to quote.
     * @param delim
     *            the delimiter characters that force quoting
     * @return the quoted string, `s` itself if no quoting is needed, or null
     *         if `s` is null
     */
    static string quoteIfNeeded(string s, string delim) {
        if (s is null)
            return null;
        if (s.length == 0)
            return "\"\"";

        for (int i = 0; i < s.length; i++) {
            char c = s[i];
            if (c == '\\' || c == '"' || c == '\'' || std.ascii.isWhite(c) || delim.indexOf(c) >= 0) {
                StringBuffer b = new StringBuffer(s.length + 8);
                quote(b, s);
                return b.toString();
            }
        }

        return s;
    }

    /**
     * Quote a string unconditionally.
     *
     * @param s
     *            The string to quote.
     * @return the quoted string, `""` (two quote characters) for the empty
     *         string, or null if `s` is null
     */
    static string quote(string s) {
        if (s is null)
            return null;
        if (s.length == 0)
            return "\"\"";

        StringBuffer b = new StringBuffer(s.length + 8);
        quote(b, s);
        return b.toString();

    }

    /// Short escape letter for each character below 32, or NO_ESCAPE.
    private __gshared char[] escapes;

    shared static this() {
        escapes = new char[32];
        escapes[] = NO_ESCAPE;
        escapes['\b'] = 'b';
        escapes['\t'] = 't';
        escapes['\n'] = 'n';
        escapes['\f'] = 'f';
        escapes['\r'] = 'r';
    }

    /**
     * Quote a string into an Appendable. Only quotes and backslash are escaped.
     *
     * @param buffer
     *            The Appendable
     * @param input
     *            The string to quote. Nothing is appended if it is null.
     */
    static void quoteOnly(Appendable buffer, string input) {
        if (input is null)
            return;

        try {
            buffer.append('"');
            for (int i = 0; i < input.length; ++i) {
                char c = input[i];
                if (c == '"' || c == '\\')
                    buffer.append('\\');
                buffer.append(c);
            }
            buffer.append('"');
        } catch (IOException x) {
            throw new RuntimeException(x);
        }
    }

    /**
     * Quote a string into an Appendable. The characters ", \, \n, \r, \t, \f
     * and \b are escaped; every other character below 32 is written as a
     * \u00XX escape.
     *
     * @param buffer
     *            The Appendable
     * @param input
     *            The string to quote. Nothing is appended if it is null.
     */
    static void quote(Appendable buffer, string input) {
        if (input is null)
            return;

        try {
            buffer.append('"');
            for (int i = 0; i < input.length; ++i) {
                char c = input[i];
                if (c >= 32) {
                    if (c == '"' || c == '\\')
                        buffer.append('\\');
                    buffer.append(c);
                } else {
                    char escape = escapes[c];
                    // The table stores NO_ESCAPE (0xFF) as its marker; a char can
                    // never equal 0xFFFF, so the marker itself must be compared.
                    if (escape == NO_ESCAPE) {
                        // Unicode escape
                        buffer.append('\\').append('u').append('0').append('0');
                        if (c < 0x10)
                            buffer.append('0');
                        buffer.append(to!string(cast(int)c, 16));
                    } else {
                        buffer.append('\\').append(escape);
                    }
                }
            }
            buffer.append('"');
        } catch (IOException x) {
            throw new RuntimeException(x);
        }
    }

    /// Same as `unquoteOnly(s, false)`.
    static string unquoteOnly(string s) {
        return unquoteOnly(s, false);
    }

    /**
     * Unquote a string, NOT converting unicode sequences
     *
     * The surrounding quotes are removed and each backslash is dropped while
     * the character after it is kept as is.
     *
     * @param s
     *            The string to unquote.
     * @param lenient
     *            if true, will leave in backslashes that aren't valid escapes
     * @return the unquoted string; `s` itself if it is shorter than two
     *         characters or not enclosed in a matching pair of quotes
     */
    static string unquoteOnly(string s, bool lenient) {
        if (s is null)
            return null;
        if (s.length < 2)
            return s;

        char first = s.charAt(0);
        char last = s.charAt(cast(int)s.length - 1);
        if (first != last || (first != '"' && first != '\''))
            return s;

        StringBuilder b = new StringBuilder(cast(int)s.length - 2);
        bool escape = false;
        for (int i = 1; i < s.length - 1; i++) {
            char c = s[i];

            if (escape) {
                escape = false;
                if (lenient && !isValidEscaping(c)) {
                    b.append('\\');
                }
                b.append(c);
            } else if (c == '\\') {
                escape = true;
            } else {
                b.append(c);
            }
        }

        return b.toString();
    }

    /// Same as `unquote(s, false)`.
    static string unquote(string s) {
        return unquote(s, false);
    }

    /**
     * Unquote a string.
     *
     * The surrounding quotes are removed and the escapes \n, \r, \t, \f, \b,
     * \\, \/, \" and \uXXXX are converted. A \uXXXX escape is appended as
     * UTF-8; a high surrogate directly followed by a low surrogate escape is
     * combined into one code point.
     *
     * @param s
     *            The string to unquote.
     * @param lenient
     *            true if unquoting should be lenient to escaped content,
     *            leaving some alone, false if string unescaping
     * @return the unquoted string; `s` itself if it is shorter than two
     *         characters or not enclosed in a matching pair of quotes
     * @throws RuntimeException
     *             if a \u escape has fewer than four characters before the
     *             closing quote
     */
    static string unquote(string s, bool lenient) {
        if (s is null)
            return null;
        if (s.length < 2)
            return s;

        char first = s.charAt(0);
        char last = s.charAt(cast(int)s.length - 1);
        if (first != last || (first != '"' && first != '\''))
            return s;

        // Index of the closing quote; the content is s[1 .. closing].
        int closing = cast(int)s.length - 1;

        StringBuilder b = new StringBuilder(cast(int)s.length - 2);
        bool escape = false;
        for (int i = 1; i < closing; i++) {
            char c = s[i];

            if (escape) {
                escape = false;
                switch (c) {
                case 'n':
                    b.append('\n');
                    break;
                case 'r':
                    b.append('\r');
                    break;
                case 't':
                    b.append('\t');
                    break;
                case 'f':
                    b.append('\f');
                    break;
                case 'b':
                    b.append('\b');
                    break;
                case '\\':
                    b.append('\\');
                    break;
                case '/':
                    b.append('/');
                    break;
                case '"':
                    b.append('"');
                    break;
                case 'u':
                    // i indexes the 'u'; the helper consumes the digits and
                    // leaves i on the last character it used.
                    b.append(decodeUnicodeEscape(s, i, closing));
                    break;
                default:
                    if (lenient && !isValidEscaping(c)) {
                        b.append('\\');
                    }
                    b.append(c);
                    break;
                }
            } else if (c == '\\') {
                escape = true;
            } else {
                b.append(c);
            }
        }

        return b.toString();
    }

    /**
     * Decode the \uXXXX escape whose 'u' is at s[i] and return it as UTF-8.
     *
     * On return i indexes the last character consumed. An unpaired surrogate
     * is replaced by U+FFFD.
     */
    private static string decodeUnicodeEscape(string s, ref int i, int closing) {
        import std.utf : encode, isValidDchar;

        // The four digits occupy s[i + 1 .. i + 5] and must precede the closing quote.
        if (i + 4 >= closing)
            throw new RuntimeException("Truncated \\u escape in: " ~ s);

        int code = parseHex4(s, i + 1);
        i += 4;

        // A high surrogate may be completed by an immediately following \uDC00-\uDFFF.
        if (code >= 0xD800 && code <= 0xDBFF && i + 6 < closing
                && s[i + 1] == '\\' && s[i + 2] == 'u') {
            int low = parseHex4(s, i + 3);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
                i += 6;
            }
        }

        dchar cp = cast(dchar) code;
        if (!isValidDchar(cp))
            cp = cast(dchar) 0xFFFD;

        char[4] utf8;
        size_t n = encode(utf8, cp);
        return utf8[0 .. n].idup;
    }

    /**
     * Combine the four hex digits s[start .. start + 4] into one 16-bit value.
     */
    private static int parseHex4(string s, int start) {
        int value = 0;
        foreach (k; 0 .. 4)
            value = (value << 4) | ConverterUtils.convertHexDigit(cast(byte) s[start + k]);
        return value;
    }

    /**
     * Check that char c (which is preceded by a backslash) is a valid escape
     * sequence.
     *
     * @param c
     *            the character following the backslash
     * @return true if c is one of n, r, t, f, b, \, /, " or u
     */
    private static bool isValidEscaping(char c) {
        return ((c == 'n') || (c == 'r') || (c == 't') || (c == 'f') || (c == 'b') || (c == '\\') || (c == '/')
                || (c == '"') || (c == 'u'));
    }

    /**
     * Test whether a string starts and ends with a double quote.
     *
     * Note that a string consisting of a single '"' satisfies this check.
     *
     * @param s
     *            the string to test, may be null
     * @return true if the first and the last character are both '"'
     */
    static bool isQuoted(string s) {
        return s !is null && s.length > 0 && s.charAt(0) == '"' && s.charAt(cast(int)s.length - 1) == '"';
    }

    /**
     * @return handle double quotes if true
     */
    bool getDouble() {
        return _double;
    }

    /**
     * @param d
     *            handle double quotes if true
     */
    void setDouble(bool d) {
        _double = d;
    }

    /**
     * @return handle single quotes if true
     */
    bool getSingle() {
        return _single;
    }

    /**
     * @param single
     *            handle single quotes if true
     */
    void setSingle(bool single) {
        _single = single;
    }
}