hunt.text.StringUtils source code

1 /*
2  * Hunt - A refined core library for D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.text.StringUtils;
13 
14 import std.array;
15 import std.ascii;
16 import std.container.array;
17 import std.conv;
18 import std.range;
19 import std.string;
20 import std.uni;
21 
22 import hunt.collection.ArrayTrie;
23 import hunt.collection.Trie;
24 import hunt.text.Common;
25 
26 /**
27 */
28 class StringUtils {
29     private enum string FOLDER_SEPARATOR = "/";
30     private enum string WINDOWS_FOLDER_SEPARATOR = "\\";
31     private enum string TOP_PATH = "..";
32     private enum string CURRENT_PATH = ".";
33     private enum char EXTENSION_SEPARATOR = '.';
34 
35     enum string EMPTY = "";
36     enum string[] EMPTY_STRING_ARRAY = [];
37 
38     enum char[] lowercases = ['\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010',
39             '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025',
40             '\026', '\027', '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', '\040', '\041', '\042',
41             '\043', '\044', '\045', '\046', '\047', '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
42             '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', '\070', '\071', '\072', '\073', '\074',
43             '\075', '\076', '\077', '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', '\150', '\151',
44             '\152', '\153', '\154', '\155', '\156', '\157', '\160', '\161', '\162', '\163', '\164', '\165', '\166',
45             '\167', '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', '\140', '\141', '\142', '\143',
46             '\144', '\145', '\146', '\147', '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', '\160',
47             '\161', '\162', '\163', '\164', '\165', '\166', '\167', '\170', '\171', '\172', '\173', '\174', '\175',
48             '\176', '\177'];
49 
50     enum string __ISO_8859_1 = "iso-8859-1";
51     enum string __UTF8 = "utf-8";
52     enum string __UTF16 = "utf-16";
53     
54     // private enum string[string] CHARSETS = ["utf-8":__UTF8, "utf8":__UTF8, 
55     //     "utf-16":__UTF16, "utf-8":__UTF16, 
56     //     "iso-8859-1":__ISO_8859_1, "iso_8859_1":__ISO_8859_1];
57 
58     private __gshared Trie!string CHARSETS;
59 
60     shared static this() {
61         CHARSETS = new ArrayTrie!string(256);
62 
63         CHARSETS.put("utf-8", __UTF8);
64         CHARSETS.put("utf8", __UTF8);
65         CHARSETS.put("utf-16", __UTF16);
66         CHARSETS.put("utf16", __UTF16);
67         CHARSETS.put("iso-8859-1", __ISO_8859_1);
68         CHARSETS.put("iso_8859_1", __ISO_8859_1);
69     }
70 
71     
72     /**
73      * Convert alternate charset names (eg utf8) to normalized name (eg UTF-8).
74      *
75      * @param s the charset to normalize
76      * @return the normalized charset (or null if normalized version not found)
77      */
78     static string normalizeCharset(string s) {
79         string n = CHARSETS.get(s);
80         return (n is null) ? s : n;
81     }
82 
83     /**
84      * Convert alternate charset names (eg utf8) to normalized name (eg UTF-8).
85      *
86      * @param s      the charset to normalize
87      * @param offset the offset in the charset
88      * @param length the length of the charset in the input param
89      * @return the normalized charset (or null if not found)
90      */
91     static string normalizeCharset(string s, int offset, int length) {
92         return normalizeCharset(s[offset .. offset+length]);
93     }
94 
95     static string asciiToLowerCase(string s) {
96         return toLower(s);
97     }
98 
99     static int toInt(string str, int from) {
100         return to!int(str[from..$]);
101     }
102 
103     static byte[] getBytes(string s) {
104         return cast(byte[])s.dup;
105     }
106 
107     static string randomId(size_t n=10, string str = letters) {
108         import std.random : randomSample;
109         import std.utf : byCodeUnit;
110         return str.byCodeUnit.randomSample(n).to!string;
111     }
112 
113     // Splitting
114     // -----------------------------------------------------------------------
115 
116     /**
117      * <p>
118      * Splits the provided text into an array, using whitespace as the
119      * separator. Whitespace is defined by {@link Character#isWhitespace(char)}.
120      * </p>
121      * <p>
122      * <p>
123      * The separator is not included in the returned string array. Adjacent
124      * separators are treated as one separator. For more control over the split
125      * use the StrTokenizer class.
126      * </p>
127      * <p>
128      * <p>
129      * A <code>null</code> input string returns <code>null</code>.
130      * </p>
131      * <p>
132      * <pre>
133      * StringUtils.split(null)       = null
134      * StringUtils.split("")         = []
135      * StringUtils.split("abc def")  = ["abc", "def"]
136      * StringUtils.split("abc  def") = ["abc", "def"]
137      * StringUtils.split(" abc ")    = ["abc"]
138      * </pre>
139      *
140      * @param str the string to parse, may be null
141      * @return an array of parsed Strings, <code>null</code> if null string
142      * input
143      */
144     static string[] split(string str) {
145         return split(str, null, -1);
146     }
147 
148     /**
149      * <p>
150      * Splits the provided text into an array, separators specified. This is an
151      * alternative to using StringTokenizer.
152      * </p>
153      * <p>
154      * <p>
155      * The separator is not included in the returned string array. Adjacent
156      * separators are treated as one separator. For more control over the split
157      * use the StrTokenizer class.
158      * </p>
159      * <p>
160      * <p>
161      * A <code>null</code> input string returns <code>null</code>. A
162      * <code>null</code> separatorChars splits on whitespace.
163      * </p>
164      * <p>
165      * <pre>
166      * StringUtils.split(null, *)         = null
167      * StringUtils.split("", *)           = []
168      * StringUtils.split("abc def", null) = ["abc", "def"]
169      * StringUtils.split("abc def", " ")  = ["abc", "def"]
170      * StringUtils.split("abc  def", " ") = ["abc", "def"]
171      * StringUtils.split("ab:cd:ef", ":") = ["ab", "cd", "ef"]
172      * </pre>
173      *
174      * @param str            the string to parse, may be null
175      * @param separatorChars the characters used as the delimiters, <code>null</code>
176      *                       splits on whitespace
177      * @return an array of parsed Strings, <code>null</code> if null string
178      * input
179      */
180     static string[] split(string str, string separatorChars) {
181         return splitWorker(str, separatorChars, -1, false);
182     }
183 
184     /**
185      * <p>
186      * Splits the provided text into an array, separator specified. This is an
187      * alternative to using StringTokenizer.
188      * </p>
189      * <p>
190      * <p>
191      * The separator is not included in the returned string array. Adjacent
192      * separators are treated as one separator. For more control over the split
193      * use the StrTokenizer class.
194      * </p>
195      * <p>
196      * <p>
197      * A <code>null</code> input string returns <code>null</code>.
198      * </p>
199      * <p>
200      * <pre>
201      * StringUtils.split(null, *)         = null
202      * StringUtils.split("", *)           = []
203      * StringUtils.split("a.b.c", '.')    = ["a", "b", "c"]
204      * StringUtils.split("a..b.c", '.')   = ["a", "b", "c"]
205      * StringUtils.split("a:b:c", '.')    = ["a:b:c"]
206      * StringUtils.split("a b c", ' ')    = ["a", "b", "c"]
207      * </pre>
208      *
209      * @param str           the string to parse, may be null
210      * @param separatorChar the character used as the delimiter
211      * @return an array of parsed Strings, <code>null</code> if null string
212      * input
213      * @since 2.0
214      */
215     static string[] split(string str, char separatorChar) {
216         return splitWorker(str, separatorChar, false);
217     }
218 
219     /**
220      * <p>
221      * Splits the provided text into an array with a maximum length, separators
222      * specified.
223      * </p>
224      * <p>
225      * <p>
226      * The separator is not included in the returned string array. Adjacent
227      * separators are treated as one separator.
228      * </p>
229      * <p>
230      * <p>
231      * A <code>null</code> input string returns <code>null</code>. A
232      * <code>null</code> separatorChars splits on whitespace.
233      * </p>
234      * <p>
235      * <p>
236      * If more than <code>max</code> delimited substrings are found, the last
237      * returned string includes all characters after the first
238      * <code>max - 1</code> returned strings (including separator characters).
239      * </p>
240      * <p>
241      * <pre>
242      * StringUtils.split(null, *, *)            = null
243      * StringUtils.split("", *, *)              = []
244      * StringUtils.split("ab de fg", null, 0)   = ["ab", "cd", "ef"]
245      * StringUtils.split("ab   de fg", null, 0) = ["ab", "cd", "ef"]
246      * StringUtils.split("ab:cd:ef", ":", 0)    = ["ab", "cd", "ef"]
247      * StringUtils.split("ab:cd:ef", ":", 2)    = ["ab", "cd:ef"]
248      * </pre>
249      *
250      * @param str            the string to parse, may be null
251      * @param separatorChars the characters used as the delimiters, <code>null</code>
252      *                       splits on whitespace
253      * @param max            the maximum number of elements to include in the array. A zero
254      *                       or negative value implies no limit
255      * @return an array of parsed Strings, <code>null</code> if null string
256      * input
257      */
258     static string[] split(string str, string separatorChars, int max) {
259         return splitWorker(str, separatorChars, max, false);
260     }
261 
262     /**
263      * Performs the logic for the <code>split</code> and
264      * <code>splitPreserveAllTokens</code> methods that return a maximum array
265      * length.
266      *
267      * @param str               the string to parse, may be <code>null</code>
268      * @param separatorChars    the separate character
269      * @param max               the maximum number of elements to include in the array. A zero
270      *                          or negative value implies no limit.
271      * @param preserveAllTokens if <code>true</code>, adjacent separators are treated as empty
272      *                          token separators; if <code>false</code>, adjacent separators
273      *                          are treated as one separator.
274      * @return an array of parsed Strings, <code>null</code> if null string
275      * input
276      */
277     private static string[] splitWorker(string str, string separatorChars, int max, bool preserveAllTokens) {
278         // Performance tuned for 2.0 (JDK1.4)
279         // Direct code is quicker than StringTokenizer.
280         // Also, StringTokenizer uses isSpace() not isWhitespace()
281 
282         if (str is null) {
283             return null;
284         }
285         int len = cast(int)str.length;
286         if (len == 0) {
287             return EMPTY_STRING_ARRAY;
288         }
289 
290         string[] list; // = new ArrayList!(string)();
291         int sizePlus1 = 1;
292         int i = 0, start = 0;
293         bool match = false;
294         bool lastMatch = false;
295         if (separatorChars is null) {
296             // Null separator means use whitespace
297             while (i < len) {                
298                 if (std.ascii.isWhite(str[i])) {
299                     if (match || preserveAllTokens) {
300                         lastMatch = true;
301                         if (sizePlus1++ == max) {
302                             i = len;
303                             lastMatch = false;
304                         }
305                         list ~= (str.substring(start, i));
306                         match = false;
307                     }
308                     start = ++i;
309                     continue;
310                 }
311                 lastMatch = false;
312                 match = true;
313                 i++;
314             }
315         } else if (separatorChars.length == 1) {
316             // Optimise 1 character case
317             char sep = separatorChars[0];
318             while (i < len) {
319                 if (str[i] == sep) {
320                     if (match || preserveAllTokens) {
321                         lastMatch = true;
322                         if (sizePlus1++ == max) {
323                             i = len;
324                             lastMatch = false;
325                         }
326                         list  ~= (str.substring(start, i));
327                         match = false;
328                     }
329                     start = ++i;
330                     continue;
331                 }
332                 lastMatch = false;
333                 match = true;
334                 i++;
335             }
336         } else {
337             // standard case
338             while (i < len) {
339                 if (separatorChars.indexOf(str[i]) >= 0) {
340                     if (match || preserveAllTokens) {
341                         lastMatch = true;
342                         if (sizePlus1++ == max) {
343                             i = len;
344                             lastMatch = false;
345                         }
346                         list ~= (str.substring(start, i));
347                         match = false;
348                     }
349                     start = ++i;
350                     continue;
351                 }
352                 lastMatch = false;
353                 match = true;
354                 i++;
355             }
356         }
357         if (match || (preserveAllTokens && lastMatch)) {
358             list ~= (str.substring(start, i));
359         }
360         return list; //.toArray(EMPTY_STRING_ARRAY);
361     }
362 
363     /**
364      * Performs the logic for the <code>split</code> and
365      * <code>splitPreserveAllTokens</code> methods that do not return a maximum
366      * array length.
367      *
368      * @param str               the string to parse, may be <code>null</code>
369      * @param separatorChar     the separate character
370      * @param preserveAllTokens if <code>true</code>, adjacent separators are treated as empty
371      *                          token separators; if <code>false</code>, adjacent separators
372      *                          are treated as one separator.
373      * @return an array of parsed Strings, <code>null</code> if null string
374      * input
375      */
376     private static string[] splitWorker(string str, char separatorChar, bool preserveAllTokens) {
377         // Performance tuned for 2.0 (JDK1.4)
378 
379         if (str is null) {
380             return null;
381         }
382         int len = cast(int)str.length;
383         if (len == 0) {
384             return EMPTY_STRING_ARRAY;
385         }
386         string[] list; // = new ArrayList!(string)();
387         int i = 0, start = 0;
388         bool match = false;
389         bool lastMatch = false;
390         while (i < len) {
391             if (str[i] == separatorChar) {
392                 if (match || preserveAllTokens) {
393                     list ~= (str.substring(start, i));
394                     match = false;
395                     lastMatch = true;
396                 }
397                 start = ++i;
398                 continue;
399             }
400             lastMatch = false;
401             match = true;
402             i++;
403         }
404         if (match || (preserveAllTokens && lastMatch)) {
405             list ~= (str.substring(start, i));
406         }
407         return list;
408     }
409 
410 
411 
412 	/**
413 	 * Copy the given Enumeration into a {@code string} array.
414 	 * The Enumeration must contain {@code string} elements only.
415 	 * @param enumeration the Enumeration to copy
416 	 * @return the {@code string} array
417 	 */
418 	static string[] toStringArray(InputRange!string range) {
419         Array!string buffer;
420         foreach(string s; range) {
421             buffer.insertBack(s);
422         }
423 		return buffer.array;
424 	}
425 
426 
427 	/**
428 	 * Convert a {@code string} array into a delimited {@code string} (e.g. CSV).
429 	 * <p>Useful for {@code toString()} implementations.
430 	 * @param arr the array to display (potentially {@code null} or empty)
431 	 * @param delim the delimiter to use (typically a ",")
432 	 * @return the delimited {@code string}
433 	 */
434 	static string arrayToDelimitedString(string[] arr, string delim) {
435 		if (arr.length == 0) {
436 			return "";
437 		}
438 		if (arr.length == 1) {
439 			return arr[0];
440 		}
441 
442         Appender!string sb;
443 		for (size_t i = 0; i < arr.length; i++) {
444 			if (i > 0) {
445 				sb.put(delim);
446 			}
447 			sb.put(arr[i]);
448 		}
449 		return sb.data;
450 	}
451 
452 	/**
453 	 * Convert a {@code string} array into a comma delimited {@code string}
454 	 * (i.e., CSV).
455 	 * <p>Useful for {@code toString()} implementations.
456 	 * @param arr the array to display (potentially {@code null} or empty)
457 	 * @return the delimited {@code string}
458 	 */
459 	static string arrayToCommaDelimitedString(string[] arr) {
460 		return arrayToDelimitedString(arr, ",");
461 	}
462 
463 
464 	/**
465 	 * Convert a comma delimited list (e.g., a row from a CSV file) into an
466 	 * array of strings.
467 	 * @param str the input {@code string} (potentially {@code null} or empty)
468 	 * @return an array of strings, or the empty array in case of empty input
469 	 */
470 	static string[] commaDelimitedListToStringArray(string str) {
471 		return delimitedListToStringArray(str, ",");
472 	}
473 
474 
475 	/**
476 	 * Take a {@code string} that is a delimited list and convert it into a
477 	 * {@code string} array.
478 	 * <p>A single {@code delimiter} may consist of more than one character,
479 	 * but it will still be considered as a single delimiter string, rather
480 	 * than as bunch of potential delimiter characters, in contrast to
481 	 * {@link #tokenizeToStringArray}.
482 	 * @param str the input {@code string} (potentially {@code null} or empty)
483 	 * @param delimiter the delimiter between elements (this is a single delimiter,
484 	 * rather than a bunch individual delimiter characters)
485 	 * @return an array of the tokens in the list
486 	 * @see #tokenizeToStringArray
487 	 */
488 	static string[] delimitedListToStringArray(string str, string delimiter) {
489 		return delimitedListToStringArray(str, delimiter, null);
490 	}
491 
492 	/**
493 	 * Take a {@code string} that is a delimited list and convert it into
494 	 * a {@code string} array.
495 	 * <p>A single {@code delimiter} may consist of more than one character,
496 	 * but it will still be considered as a single delimiter string, rather
497 	 * than as bunch of potential delimiter characters, in contrast to
498 	 * {@link #tokenizeToStringArray}.
499 	 * @param str the input {@code string} (potentially {@code null} or empty)
500 	 * @param delimiter the delimiter between elements (this is a single delimiter,
501 	 * rather than a bunch individual delimiter characters)
502 	 * @param charsToDelete a set of characters to delete; useful for deleting unwanted
503 	 * line breaks: e.g. "\r\n\f" will delete all new lines and line feeds in a {@code string}
504 	 * @return an array of the tokens in the list
505 	 * @see #tokenizeToStringArray
506 	 */
507 	static string[] delimitedListToStringArray(string str, 
508         string delimiter, string charsToDelete) {
509 
510 		if (str.empty()) {
511 			return [];
512 		}
513 		if (delimiter is null) {
514 			return [str];
515 		}
516 
517 		Array!string result;
518 		if ("" == delimiter) {
519 			for (size_t i = 0; i < str.length; i++) {
520 				result.insertBack(deleteAny(str[i .. i + 1], charsToDelete));
521 			}
522 		}
523 		else {
524 			size_t pos = 0;
525 			ptrdiff_t delPos;
526 			while ((delPos = str.indexOf(delimiter, pos)) != -1) {
527 				result.insertBack(deleteAny(str[pos .. delPos], charsToDelete));
528 				pos = delPos + delimiter.length;
529 			}
530 			if (str.length > 0 && pos <= str.length) {
531 				// Add rest of string, but not in case of empty input.
532 				result.insertBack(deleteAny(str[pos .. $], charsToDelete));
533 			}
534 		}
535 		return result.array;
536 	}
537 
538 
539 	/**
540 	 * Delete any character in a given {@code string}.
541 	 * @param inString the original {@code string}
542 	 * @param charsToDelete a set of characters to delete.
543 	 * E.g. "az\n" will delete 'a's, 'z's and new lines.
544 	 * @return the resulting {@code string}
545 	 */
546 	static string deleteAny(string inString, string charsToDelete) {
547 		if (inString.empty() || charsToDelete.empty()) {
548 			return inString;
549 		}
550 
551         Appender!string sb;
552 		for (size_t i = 0; i < inString.length; i++) {
553 			char c = inString[i];
554 			if (charsToDelete.indexOf(c) == -1) {
555 				sb.put(c);
556 			}
557 		}
558 		return sb.data;
559 	}
560 
561 }