1 /*
2  * Hunt - A refined core library for D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.text.QuotedStringTokenizer;
13 
14 import std.conv;
15 import std.ascii;
16 import std.string;
17 
18 import hunt.collection.StringBuffer;
19 import hunt.text.StringTokenizer;
20 import hunt.text.Common;
21 import hunt.text.StringBuilder;
22 import hunt.Exceptions;
23 import hunt.util.Common;
24 import hunt.util.ConverterUtils;
25 
26 
/**
 * StringTokenizer with quoting support.
 *
 * This class mirrors the java.util.StringTokenizer API and behaves the same
 * way, except that single and double quoted string values are recognised.
 * Delimiters within quotes are not considered delimiters. Quotes can be
 * escaped with '\'.
 *
 * Besides tokenizing, the class offers static helpers to quote and unquote
 * strings.
 *
 * @see java.util.StringTokenizer
 */
class QuotedStringTokenizer : StringTokenizer {
	/// Delimiters used when the caller does not supply any.
	private enum string __delim = "\t\n\r";

	/// Marker stored in `escapes` for control characters that have no short
	/// (single letter) escape and must be written as \u00XX.
	private enum char NO_SHORT_ESCAPE = cast(char) 0xFF;

	/// States of the scanner in hasMoreTokens().
	private enum State {
		start,       // nothing of the next token seen yet
		token,       // inside an unquoted part of a token
		singleQuote, // inside a '...' section
		doubleQuote  // inside a "..." section
	}

	private string _string;                  // the text being tokenized
	private string _delim = __delim;         // current delimiter set
	private bool _returnQuotes = false;      // keep quote (and escape) characters in tokens
	private bool _returnDelimiters = false;  // return delimiters as tokens of their own
	private StringBuffer _token;             // accumulates the token being scanned
	private bool _hasToken = false;          // true when _token holds a token not yet handed out
	private int _i = 0;                      // index of the next character to scan
	private int _lastStart = 0;              // index where the most recent scan started
	private bool _double = true;             // recognise double quotes
	private bool _single = true;             // recognise single quotes

	/**
	 * Create a tokenizer.
	 *
	 * @param str
	 *            the string to tokenize
	 * @param delim
	 *            the delimiter characters; the defaults ("\t\n\r") are kept
	 *            when this is null
	 * @param returnDelimiters
	 *            if true, delimiters are returned as tokens
	 * @param returnQuotes
	 *            if true, quote and escape characters are kept in the tokens
	 * @throws Error if delim contains a single or double quote
	 */
	this(string str, string delim, bool returnDelimiters, bool returnQuotes) {
		super("");
		_string = str;
		if (delim !is null)
			_delim = delim;
		_returnDelimiters = returnDelimiters;
		_returnQuotes = returnQuotes;

		if (_delim.indexOf('\'') >= 0 || _delim.indexOf('"') >= 0)
			throw new Error("Can't use quotes as delimiters: " ~ _delim);

		// Initial capacity guess: half the input, capped at 512 for long inputs.
		_token = new StringBuffer(_string.length > 1024 ? 512 : _string.length / 2);
	}

	/// Same as the four argument constructor with returnQuotes = false.
	this(string str, string delim, bool returnDelimiters) {
		this(str, delim, returnDelimiters, false);
	}

	/// Same as the four argument constructor with both flags false.
	this(string str, string delim) {
		this(str, delim, false, false);
	}

	/// Tokenize str using the default delimiters.
	this(string str) {
		this(str, null, false, false);
	}

	/**
	 * Scan ahead for the next token. The token found is buffered and handed
	 * out by the next call to nextToken(); calling this method repeatedly
	 * without nextToken() does not advance any further.
	 *
	 * @return true if a token is available
	 */
	override
	bool hasMoreTokens() {
		// Already found a token
		if (_hasToken)
			return true;

		// Remembered so that nextToken(delim) can rescan from here.
		_lastStart = _i;

		State state = State.start;
		bool escape = false;
		while (_i < _string.length) {
			char c = _string[_i++];

			final switch (state) {
			case State.start:
				if (_delim.indexOf(c) >= 0) {
					// Leading delimiters are skipped unless they are wanted as tokens.
					if (_returnDelimiters) {
						_token.append(c);
						_hasToken = true;
						return true;
					}
				} else if (c == '\'' && _single) {
					if (_returnQuotes)
						_token.append(c);
					state = State.singleQuote;
				} else if (c == '\"' && _double) {
					if (_returnQuotes)
						_token.append(c);
					state = State.doubleQuote;
				} else {
					_token.append(c);
					_hasToken = true;
					state = State.token;
				}
				break;

			case State.token:
				_hasToken = true;
				if (_delim.indexOf(c) >= 0) {
					// Push the delimiter back so the next scan returns it as a token.
					if (_returnDelimiters)
						_i--;
					return _hasToken;
				} else if (c == '\'' && _single) {
					if (_returnQuotes)
						_token.append(c);
					state = State.singleQuote;
				} else if (c == '\"' && _double) {
					if (_returnQuotes)
						_token.append(c);
					state = State.doubleQuote;
				} else {
					_token.append(c);
				}
				break;

			case State.singleQuote:
				// Entering a quoted section yields a token even if it is empty.
				_hasToken = true;
				if (escape) {
					escape = false;
					_token.append(c);
				} else if (c == '\'') {
					if (_returnQuotes)
						_token.append(c);
					state = State.token;
				} else if (c == '\\') {
					if (_returnQuotes)
						_token.append(c);
					escape = true;
				} else {
					_token.append(c);
				}
				break;

			case State.doubleQuote:
				// Entering a quoted section yields a token even if it is empty.
				_hasToken = true;
				if (escape) {
					escape = false;
					_token.append(c);
				} else if (c == '\"') {
					if (_returnQuotes)
						_token.append(c);
					state = State.token;
				} else if (c == '\\') {
					if (_returnQuotes)
						_token.append(c);
					escape = true;
				} else {
					_token.append(c);
				}
				break;
			}
		}

		return _hasToken;
	}

	/**
	 * Return the next token and clear the token buffer.
	 *
	 * @return the next token
	 * @throws NoSuchElementException if there are no more tokens
	 */
	override
	string nextToken() {
		if (!hasMoreTokens() || _token is null)
			throw new NoSuchElementException("");
		string t = _token.toString();
		_token.setLength(0);
		_hasToken = false;
		return t;
	}

	/**
	 * Switch to a new delimiter set and return the next token. Scanning is
	 * restarted from the position where the most recent scan began, and any
	 * buffered token is discarded.
	 *
	 * @param delim
	 *            the new delimiter characters
	 * @return the next token
	 * @throws NoSuchElementException if there are no more tokens
	 */
	override
	string nextToken(string delim) {
		_delim = delim;
		_i = _lastStart;
		_token.setLength(0);
		_hasToken = false;
		return nextToken();
	}


	/**
	 * Not implemented.
	 *
	 * @return always -1
	 */
	override
	int countTokens() {
		return -1;
	}

	/**
	 * Quote a string. The string is quoted only if quoting is required due to
	 * embedded delimiters, whitespace, backslashes, quote characters or the
	 * empty string.
	 *
	 * @param s
	 *            The string to quote.
	 * @param delim
	 *            the delimiter characters that force quoting
	 * @return the quoted string, s itself when no quoting is needed, or null
	 *         when s is null
	 */
	static string quoteIfNeeded(string s, string delim) {
		if (s is null)
			return null;
		if (s.length == 0)
			return "\"\"";

		for (int i = 0; i < s.length; i++) {
			char c = s[i];
			if (c == '\\' || c == '"' || c == '\'' || std.ascii.isWhite(c) || delim.indexOf(c) >= 0) {
				StringBuffer b = new StringBuffer(s.length + 8);
				quote(b, s);
				return b.toString();
			}
		}

		return s;
	}

	/**
	 * Quote a string. The string is always wrapped in double quotes, with
	 * the same escaping as quote(Appendable, string).
	 *
	 * @param s
	 *            The string to quote.
	 * @return the quoted string, or null when s is null
	 */
	static string quote(string s) {
		if (s is null)
			return null;
		if (s.length == 0)
			return "\"\"";

		StringBuffer b = new StringBuffer(s.length + 8);
		quote(b, s);
		return b.toString();
	}

	/// Short escape letter for each control character below 32, or
	/// NO_SHORT_ESCAPE when the character has none.
	private __gshared char[] escapes;

	shared static this() {
		escapes = new char[32];
		escapes[] = NO_SHORT_ESCAPE;
		escapes['\b'] = 'b';
		escapes['\t'] = 't';
		escapes['\n'] = 'n';
		escapes['\f'] = 'f';
		escapes['\r'] = 'r';
	}

	/**
	 * Quote a string into an Appendable. Only quotes and backslash are escaped.
	 * Nothing is appended when input is null.
	 *
	 * @param buffer
	 *            The Appendable
	 * @param input
	 *            The string to quote.
	 */
	static void quoteOnly(Appendable buffer, string input) {
		if (input is null)
			return;

		try {
			buffer.append('"');
			for (int i = 0; i < input.length; ++i) {
				char c = input[i];
				if (c == '"' || c == '\\')
					buffer.append('\\');
				buffer.append(c);
			}
			buffer.append('"');
		} catch (IOException x) {
			throw new RuntimeException(x);
		}
	}

	/**
	 * Quote a string into an Appendable. The characters ", \, \n, \r, \t, \f
	 * and \b are escaped with a backslash; any other character below 32 is
	 * written as a \u00XX escape. Nothing is appended when input is null.
	 *
	 * @param buffer
	 *            The Appendable
	 * @param input
	 *            The string to quote.
	 */
	static void quote(Appendable buffer, string input) {
		if (input is null)
			return;

		try {
			buffer.append('"');
			for (int i = 0; i < input.length; ++i) {
				char c = input[i];
				if (c >= 32) {
					if (c == '"' || c == '\\')
						buffer.append('\\');
					buffer.append(c);
				} else {
					char escape = escapes[c];
					// The table holds 8-bit chars, so the marker must be compared
					// as an 8-bit value; comparing against 0xFFFF never matches.
					if (escape == NO_SHORT_ESCAPE) {
						// Unicode escape
						buffer.append('\\').append('u').append('0').append('0');
						if (c < 0x10)
							buffer.append('0');
						buffer.append(to!string(cast(int) c, 16));
					} else {
						buffer.append('\\').append(escape);
					}
				}
			}
			buffer.append('"');
		} catch (IOException x) {
			throw new RuntimeException(x);
		}
	}

	/// Same as unquoteOnly(s, false).
	static string unquoteOnly(string s) {
		return unquoteOnly(s, false);
	}

	/**
	 * Unquote a string, NOT converting escape sequences: a backslash is
	 * dropped and the character following it is kept as it is.
	 *
	 * Strings that are shorter than two characters, or that are not enclosed
	 * in a matching pair of single or double quotes, are returned unchanged.
	 *
	 * @param s
	 *            The string to unquote.
	 * @param lenient
	 *            if true, will leave in backslashes that aren't valid escapes
	 * @return the unquoted string, or null when s is null
	 */
	static string unquoteOnly(string s, bool lenient) {
		if (s is null)
			return null;
		if (s.length < 2)
			return s;

		immutable int end = cast(int) s.length - 1; // index of the closing quote
		char first = s[0];
		char last = s[end];
		if (first != last || (first != '"' && first != '\''))
			return s;

		StringBuilder b = new StringBuilder(cast(int) s.length - 2);
		bool escape = false;
		for (int i = 1; i < end; i++) {
			char c = s[i];

			if (escape) {
				escape = false;
				if (lenient && !isValidEscaping(c)) {
					b.append('\\');
				}
				b.append(c);
			} else if (c == '\\') {
				escape = true;
			} else {
				b.append(c);
			}
		}

		return b.toString();
	}

	/// Same as unquote(s, false).
	static string unquote(string s) {
		return unquote(s, false);
	}

	/**
	 * Unquote a string, converting the escape sequences \n, \r, \t, \f, \b,
	 * \\, \/, \" and \uXXXX. A \uXXXX escape is appended as UTF-8; a high
	 * surrogate directly followed by a low surrogate escape is combined into
	 * one code point, and a lone surrogate becomes U+FFFD. A \u that is not
	 * followed by four characters inside the quotes is treated like any
	 * other escaped character (the 'u' is kept).
	 *
	 * Strings that are shorter than two characters, or that are not enclosed
	 * in a matching pair of single or double quotes, are returned unchanged.
	 *
	 * @param s
	 *            The string to unquote.
	 * @param lenient
	 *            true if unquoting should be lenient to escaped content,
	 *            leaving backslashes in front of characters that do not form
	 *            a valid escape; false to drop such backslashes
	 * @return the unquoted string, or null when s is null
	 */
	static string unquote(string s, bool lenient) {
		if (s is null)
			return null;
		if (s.length < 2)
			return s;

		immutable int end = cast(int) s.length - 1; // index of the closing quote
		char first = s[0];
		char last = s[end];
		if (first != last || (first != '"' && first != '\''))
			return s;

		StringBuilder b = new StringBuilder(cast(int) s.length - 2);
		bool escape = false;
		for (int i = 1; i < end; i++) {
			char c = s[i];

			if (escape) {
				escape = false;
				switch (c) {
				case 'n':
					b.append('\n');
					break;
				case 'r':
					b.append('\r');
					break;
				case 't':
					b.append('\t');
					break;
				case 'f':
					b.append('\f');
					break;
				case 'b':
					b.append('\b');
					break;
				case '\\':
					b.append('\\');
					break;
				case '/':
					b.append('/');
					break;
				case '"':
					b.append('"');
					break;
				case 'u':
					// The four hex digits sit at i+1 .. i+4 and must all lie
					// before the closing quote.
					if (i + 4 >= end) {
						b.append(c);
						break;
					}
					uint cp = decodeHex4(s, i + 1);
					i += 4; // i now rests on the last digit; the loop's i++ moves past it

					// A high surrogate may be completed by an immediately
					// following \uXXXX low surrogate.
					if (cp >= 0xD800 && cp <= 0xDBFF && i + 6 < end
							&& s[i + 1] == '\\' && s[i + 2] == 'u') {
						uint low = decodeHex4(s, i + 3);
						if (low >= 0xDC00 && low <= 0xDFFF) {
							cp = 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00);
							i += 6;
						}
					}
					appendCodePoint(b, cast(dchar) cp);
					break;
				default:
					if (lenient && !isValidEscaping(c)) {
						b.append('\\');
					}
					b.append(c);
				}
			} else if (c == '\\') {
				escape = true;
			} else {
				b.append(c);
			}
		}

		return b.toString();
	}

	/// Combine the four hex digits s[start .. start + 4] into one value.
	private static uint decodeHex4(string s, int start) {
		uint value = 0;
		foreach (k; 0 .. 4) {
			value = (value << 4)
				| cast(uint) (ConverterUtils.convertHexDigit(cast(byte) s[start + k]) & 0xF);
		}
		return value;
	}

	/// Append cp to b as UTF-8, substituting U+FFFD for invalid code points.
	private static void appendCodePoint(StringBuilder b, dchar cp) {
		import std.utf : encode, isValidDchar;

		if (!isValidDchar(cp))
			cp = '\uFFFD';
		char[4] utf8;
		immutable size_t n = encode(utf8, cp);
		foreach (k; 0 .. n)
			b.append(utf8[k]);
	}

	/**
	 * Check that char c (which is preceded by a backslash) is a valid escape
	 * sequence.
	 *
	 * @param c
	 *            the character following the backslash
	 * @return true if c is one of n, r, t, f, b, \, /, " or u
	 */
	private static bool isValidEscaping(char c) {
		return ((c == 'n') || (c == 'r') || (c == 't') || (c == 'f') || (c == 'b') || (c == '\\') || (c == '/')
				|| (c == '"') || (c == 'u'));
	}

	/**
	 * @param s
	 *            the string to test
	 * @return true if s is non-empty and both starts and ends with a double
	 *         quote (a string consisting of one double quote counts too)
	 */
	static bool isQuoted(string s) {
		return s !is null && s.length > 0 && s[0] == '"' && s[s.length - 1] == '"';
	}

	/**
	 * @return handle double quotes if true
	 */
	bool getDouble() {
		return _double;
	}

	/**
	 * @param d
	 *            handle double quotes if true
	 */
	void setDouble(bool d) {
		_double = d;
	}

	/**
	 * @return handle single quotes if true
	 */
	bool getSingle() {
		return _single;
	}

	/**
	 * @param single
	 *            handle single quotes if true
	 */
	void setSingle(bool single) {
		_single = single;
	}
}