/*
 * Hunt - A refined core library for D programming language.
 *
 * Copyright (C) 2018-2019 HuntLabs
 *
 * Website: https://www.huntlabs.net/
 *
 * Licensed under the Apache-2.0 License.
 *
 */

module hunt.text.StringUtils;

import std.array;
import std.ascii;
import std.container.array;
import std.conv;
import std.range;
import std.string;
import std.uni;

import hunt.collection.ArrayTrie;
import hunt.collection.Trie;
import hunt.text.Common;

/**
 * Static helpers for common string chores: charset-name normalization,
 * tokenizing on separator characters, and converting between delimited
 * strings and string arrays.
 */
class StringUtils {
    private enum string FOLDER_SEPARATOR = "/";
    private enum string WINDOWS_FOLDER_SEPARATOR = "\\";
    private enum string TOP_PATH = "..";
    private enum string CURRENT_PATH = ".";
    private enum char EXTENSION_SEPARATOR = '.';

    enum string EMPTY = "";
    enum string[] EMPTY_STRING_ARRAY = [];

    /**
     * Lookup table indexed by a 7-bit ASCII code unit. Entries for 'A'..'Z'
     * hold the matching lower-case letter; every other entry holds its own
     * index.
     */
    enum char[] lowercases = ['\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010',
        '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025',
        '\026', '\027', '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', '\040', '\041', '\042',
        '\043', '\044', '\045', '\046', '\047', '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
        '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', '\070', '\071', '\072', '\073', '\074',
        '\075', '\076', '\077', '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', '\150', '\151',
        '\152', '\153', '\154', '\155', '\156', '\157', '\160', '\161', '\162', '\163', '\164', '\165', '\166',
        '\167', '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', '\140', '\141', '\142', '\143',
        '\144', '\145', '\146', '\147', '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', '\160',
        '\161', '\162', '\163', '\164', '\165', '\166', '\167', '\170', '\171', '\172', '\173', '\174', '\175',
        '\176', '\177'];

    enum string __ISO_8859_1 = "iso-8859-1";
    enum string __UTF8 = "utf-8";
    enum string __UTF16 = "utf-16";

    /// Maps accepted charset aliases to their canonical names; filled in by the static constructor below.
    private __gshared Trie!string CHARSETS;

    shared static this() {
        CHARSETS = new ArrayTrie!string(256);

        CHARSETS.put("utf-8", __UTF8);
        CHARSETS.put("utf8", __UTF8);
        CHARSETS.put("utf-16", __UTF16);
        CHARSETS.put("utf16", __UTF16);
        CHARSETS.put("iso-8859-1", __ISO_8859_1);
        CHARSETS.put("iso_8859_1", __ISO_8859_1);
    }

    /**
     * Convert an alternate charset name (e.g. "utf8") to its canonical name
     * (e.g. "utf-8").
     *
     * @param s the charset name to normalize
     * @return the canonical name registered in the alias table, or
     *         <code>s</code> itself when the lookup yields nothing
     */
    static string normalizeCharset(string s) {
        string n = CHARSETS.get(s);
        return (n is null) ? s : n;
    }

    /**
     * Convert an alternate charset name embedded in a larger string to its
     * canonical name.
     *
     * @param s      the string containing the charset name
     * @param offset index of the first character of the charset name
     * @param length number of characters in the charset name
     * @return the canonical name, or the extracted slice itself when it is
     *         not a known alias
     */
    static string normalizeCharset(string s, int offset, int length) {
        return normalizeCharset(s[offset .. offset + length]);
    }

    /**
     * Lower-case a string by delegating to <code>toLower</code>.
     *
     * NOTE(review): with a string argument this appears to resolve to the
     * Unicode-aware overload re-exported by std.string, so non-ASCII letters
     * are lowered as well despite the method name -- confirm before relying
     * on ASCII-only behaviour.
     *
     * @param s the string to convert
     * @return the lower-cased string
     */
    static string asciiToLowerCase(string s) {
        return toLower(s);
    }

    /**
     * Parse the tail of a string as an <code>int</code>.
     *
     * @param str  the string holding the number
     * @param from index at which the number starts; everything from there to
     *             the end of the string is handed to <code>to!int</code>
     * @return the parsed value
     */
    static int toInt(string str, int from) {
        return to!int(str[from .. $]);
    }

    /**
     * Return a freshly allocated, mutable copy of the string's code units
     * reinterpreted as bytes.
     *
     * @param s the string to copy
     * @return a new byte array with the same content as <code>s</code>
     */
    static byte[] getBytes(string s) {
        return cast(byte[]) s.dup;
    }

    /**
     * Build a random identifier of <code>n</code> code units, each drawn
     * independently and uniformly from <code>str</code>.
     *
     * Earlier revisions used <code>randomSample</code>, which selects an
     * order-preserving subset without replacement: every id came out sorted
     * in alphabet order, could never repeat a character, and the call failed
     * whenever <code>n</code> exceeded the alphabet size. Picking each
     * position independently removes all three limitations.
     *
     * @param n   the number of code units to generate
     * @param str the alphabet to draw from (code units, not code points);
     *            must not be empty unless <code>n</code> is zero
     * @return the generated identifier, or the empty string when
     *         <code>n</code> is zero
     */
    static string randomId(size_t n = 10, string str = letters) {
        import std.exception : assumeUnique, enforce;
        import std.random : uniform;

        if (n == 0) {
            return "";
        }
        enforce(str.length > 0, "randomId: the alphabet must not be empty");

        char[] id = new char[n];
        foreach (ref c; id) {
            c = str[uniform(size_t(0), str.length)];
        }
        // The buffer never escapes in mutable form, so it may be published as immutable.
        return assumeUnique(id);
    }

    // Splitting
    // -----------------------------------------------------------------------

    /**
     * <p>
     * Splits the provided text into an array, using ASCII whitespace as the
     * separator.
     * </p>
     * <p>
     * The separator is not included in the returned string array. Adjacent
     * separators are treated as one separator.
     * </p>
     * <p>
     * A <code>null</code> input string returns <code>null</code>.
     * </p>
     * <pre>
     * StringUtils.split(null)       = null
     * StringUtils.split("")         = []
     * StringUtils.split("abc def")  = ["abc", "def"]
     * StringUtils.split("abc  def") = ["abc", "def"]
     * StringUtils.split(" abc ")    = ["abc"]
     * </pre>
     *
     * @param str the string to parse, may be null
     * @return an array of parsed strings, <code>null</code> if null string
     *         input
     */
    static string[] split(string str) {
        return split(str, null, -1);
    }

    /**
     * <p>
     * Splits the provided text into an array, separators specified.
     * </p>
     * <p>
     * The separator is not included in the returned string array. Adjacent
     * separators are treated as one separator.
     * </p>
     * <p>
     * A <code>null</code> input string returns <code>null</code>. A
     * <code>null</code> separatorChars splits on whitespace.
     * </p>
     * <pre>
     * StringUtils.split(null, *)         = null
     * StringUtils.split("", *)           = []
     * StringUtils.split("abc def", null) = ["abc", "def"]
     * StringUtils.split("abc def", " ")  = ["abc", "def"]
     * StringUtils.split("abc  def", " ") = ["abc", "def"]
     * StringUtils.split("ab:cd:ef", ":") = ["ab", "cd", "ef"]
     * </pre>
     *
     * @param str            the string to parse, may be null
     * @param separatorChars the characters used as the delimiters,
     *                       <code>null</code> splits on whitespace
     * @return an array of parsed strings, <code>null</code> if null string
     *         input
     */
    static string[] split(string str, string separatorChars) {
        return splitWorker(str, separatorChars, -1, false);
    }

    /**
     * <p>
     * Splits the provided text into an array, separator specified.
     * </p>
     * <p>
     * The separator is not included in the returned string array. Adjacent
     * separators are treated as one separator.
     * </p>
     * <p>
     * A <code>null</code> input string returns <code>null</code>.
     * </p>
     * <pre>
     * StringUtils.split(null, *)        = null
     * StringUtils.split("", *)          = []
     * StringUtils.split("a.b.c", '.')   = ["a", "b", "c"]
     * StringUtils.split("a..b.c", '.')  = ["a", "b", "c"]
     * StringUtils.split("a:b:c", '.')   = ["a:b:c"]
     * StringUtils.split("a b c", ' ')   = ["a", "b", "c"]
     * </pre>
     *
     * @param str           the string to parse, may be null
     * @param separatorChar the character used as the delimiter
     * @return an array of parsed strings, <code>null</code> if null string
     *         input
     */
    static string[] split(string str, char separatorChar) {
        return splitWorker(str, separatorChar, false);
    }

    /**
     * <p>
     * Splits the provided text into an array with a maximum length,
     * separators specified.
     * </p>
     * <p>
     * The separator is not included in the returned string array. Adjacent
     * separators are treated as one separator.
     * </p>
     * <p>
     * A <code>null</code> input string returns <code>null</code>. A
     * <code>null</code> separatorChars splits on whitespace.
     * </p>
     * <p>
     * If more than <code>max</code> delimited substrings are found, the last
     * returned string includes all characters after the first
     * <code>max - 1</code> returned strings (including separator characters).
     * </p>
     * <pre>
     * StringUtils.split(null, *, *)            = null
     * StringUtils.split("", *, *)              = []
     * StringUtils.split("ab de fg", null, 0)   = ["ab", "de", "fg"]
     * StringUtils.split("ab   de fg", null, 0) = ["ab", "de", "fg"]
     * StringUtils.split("ab:cd:ef", ":", 0)    = ["ab", "cd", "ef"]
     * StringUtils.split("ab:cd:ef", ":", 2)    = ["ab", "cd:ef"]
     * </pre>
     *
     * @param str            the string to parse, may be null
     * @param separatorChars the characters used as the delimiters,
     *                       <code>null</code> splits on whitespace
     * @param max            the maximum number of elements to include in the
     *                       array. A zero or negative value implies no limit
     * @return an array of parsed strings, <code>null</code> if null string
     *         input
     */
    static string[] split(string str, string separatorChars, int max) {
        return splitWorker(str, separatorChars, max, false);
    }

    /**
     * Chooses the separator test for the <code>split</code> overloads that
     * take a set of separator characters and hands the work to
     * <code>tokenize</code>.
     *
     * @param str               the string to parse, may be <code>null</code>
     * @param separatorChars    the separator characters; <code>null</code>
     *                          means ASCII whitespace
     * @param max               the maximum number of elements to include in
     *                          the array. A zero or negative value implies no
     *                          limit.
     * @param preserveAllTokens if <code>true</code>, adjacent separators
     *                          delimit empty tokens; if <code>false</code>,
     *                          adjacent separators are treated as one
     *                          separator.
     * @return an array of parsed strings, <code>null</code> if null string
     *         input
     */
    private static string[] splitWorker(string str, string separatorChars, int max, bool preserveAllTokens) {
        if (separatorChars is null) {
            // Null separator means use whitespace. The call is fully qualified
            // because std.uni, also imported by this module, declares isWhite too.
            return tokenize(str, (char c) => std.ascii.isWhite(c), max, preserveAllTokens);
        }
        if (separatorChars.length == 1) {
            // A single separator needs only an equality test.
            immutable char sep = separatorChars[0];
            return tokenize(str, (char c) => c == sep, max, preserveAllTokens);
        }
        // General case: membership test against the separator set.
        return tokenize(str, (char c) => separatorChars.indexOf(c) >= 0, max, preserveAllTokens);
    }

    /**
     * Single-separator variant of <code>splitWorker</code> without a token
     * limit.
     *
     * @param str               the string to parse, may be <code>null</code>
     * @param separatorChar     the separator character
     * @param preserveAllTokens if <code>true</code>, adjacent separators
     *                          delimit empty tokens; if <code>false</code>,
     *                          adjacent separators are treated as one
     *                          separator.
     * @return an array of parsed strings, <code>null</code> if null string
     *         input
     */
    private static string[] splitWorker(string str, char separatorChar, bool preserveAllTokens) {
        return tokenize(str, (char c) => c == separatorChar, -1, preserveAllTokens);
    }

    /**
     * Shared tokenizing loop behind every <code>splitWorker</code> overload.
     *
     * @param str               the string to parse, may be <code>null</code>
     * @param isSeparator       returns <code>true</code> for code units that
     *                          separate tokens
     * @param max               the maximum number of elements to produce; a
     *                          zero or negative value implies no limit
     * @param preserveAllTokens whether empty tokens between adjacent
     *                          separators are kept
     * @return the tokens, or <code>null</code> if <code>str</code> is null
     */
    private static string[] tokenize(string str, scope bool delegate(char) isSeparator,
            int max, bool preserveAllTokens) {
        if (str is null) {
            return null;
        }
        int len = cast(int) str.length;
        if (len == 0) {
            return EMPTY_STRING_ARRAY;
        }

        string[] list;
        int sizePlus1 = 1; // number of tokens collected so far, plus one
        int i = 0, start = 0;
        bool match = false;     // a token is currently open
        bool lastMatch = false; // the previous code unit closed a token
        while (i < len) {
            if (isSeparator(str[i])) {
                if (match || preserveAllTokens) {
                    lastMatch = true;
                    if (sizePlus1++ == max) {
                        // Limit reached: the final token swallows the rest of the input.
                        i = len;
                        lastMatch = false;
                    }
                    list ~= (str.substring(start, i));
                    match = false;
                }
                start = ++i;
                continue;
            }
            lastMatch = false;
            match = true;
            i++;
        }
        // Flush the token still open at the end of the input (or the empty
        // trailing token when empty tokens are preserved).
        if (match || (preserveAllTokens && lastMatch)) {
            list ~= (str.substring(start, i));
        }
        return list;
    }

    /**
     * Copy the elements of an input range of strings into a
     * {@code string} array.
     * @param range the range to drain
     * @return the {@code string} array holding the range's elements in order
     */
    static string[] toStringArray(InputRange!string range) {
        Array!string buffer;
        foreach (string s; range) {
            buffer.insertBack(s);
        }
        return buffer.array;
    }

    /**
     * Convert a {@code string} array into a delimited {@code string} (e.g. CSV).
     * <p>Useful for {@code toString()} implementations.
     * @param arr the array to display (potentially {@code null} or empty)
     * @param delim the delimiter to use (typically a ",")
     * @return the delimited {@code string}; the empty string for an empty
     *         array, and the sole element itself for a one-element array
     */
    static string arrayToDelimitedString(string[] arr, string delim) {
        if (arr.length == 0) {
            return "";
        }
        if (arr.length == 1) {
            return arr[0];
        }
        return arr.join(delim);
    }

    /**
     * Convert a {@code string} array into a comma delimited {@code string}
     * (i.e., CSV).
     * <p>Useful for {@code toString()} implementations.
     * @param arr the array to display (potentially {@code null} or empty)
     * @return the delimited {@code string}
     */
    static string arrayToCommaDelimitedString(string[] arr) {
        return arrayToDelimitedString(arr, ",");
    }

    /**
     * Convert a comma delimited list (e.g., a row from a CSV file) into an
     * array of strings.
     * @param str the input {@code string} (potentially {@code null} or empty)
     * @return an array of strings, or the empty array in case of empty input
     */
    static string[] commaDelimitedListToStringArray(string str) {
        return delimitedListToStringArray(str, ",");
    }

    /**
     * Take a {@code string} that is a delimited list and convert it into a
     * {@code string} array.
     * <p>A single {@code delimiter} may consist of more than one character,
     * but it is still treated as one delimiter string rather than as a set
     * of individual delimiter characters.
     * @param str the input {@code string} (potentially {@code null} or empty)
     * @param delimiter the delimiter between elements
     * @return an array of the tokens in the list
     */
    static string[] delimitedListToStringArray(string str, string delimiter) {
        return delimitedListToStringArray(str, delimiter, null);
    }

    /**
     * Take a {@code string} that is a delimited list and convert it into
     * a {@code string} array.
     * <p>A single {@code delimiter} may consist of more than one character,
     * but it is still treated as one delimiter string rather than as a set
     * of individual delimiter characters. A {@code null} delimiter returns
     * the input as the only element; an empty (non-null) delimiter splits
     * the input into its individual code units. Empty tokens, including one
     * after a trailing delimiter, are kept.
     * @param str the input {@code string} (potentially {@code null} or empty)
     * @param delimiter the delimiter between elements
     * @param charsToDelete a set of characters to strip from every token;
     * useful for deleting unwanted line breaks: e.g. "\r\n\f"
     * @return an array of the tokens in the list, empty for empty input
     */
    static string[] delimitedListToStringArray(string str,
            string delimiter, string charsToDelete) {

        if (str.empty()) {
            return [];
        }
        if (delimiter is null) {
            return [str];
        }

        Array!string result;
        if ("" == delimiter) {
            // Empty delimiter: every code unit becomes its own token.
            for (size_t i = 0; i < str.length; i++) {
                result.insertBack(deleteAny(str[i .. i + 1], charsToDelete));
            }
        }
        else {
            size_t pos = 0;
            ptrdiff_t delPos;
            while ((delPos = str.indexOf(delimiter, pos)) != -1) {
                result.insertBack(deleteAny(str[pos .. delPos], charsToDelete));
                pos = delPos + delimiter.length;
            }
            if (str.length > 0 && pos <= str.length) {
                // Add rest of string, but not in case of empty input.
                result.insertBack(deleteAny(str[pos .. $], charsToDelete));
            }
        }
        return result.array;
    }

    /**
     * Delete every occurrence of the given characters from a {@code string}.
     * @param inString the original {@code string}
     * @param charsToDelete a set of characters to delete.
     * E.g. "az\n" will delete 'a's, 'z's and new lines.
     * @return the resulting {@code string}; {@code inString} itself when
     * either argument is empty
     */
    static string deleteAny(string inString, string charsToDelete) {
        if (inString.empty() || charsToDelete.empty()) {
            return inString;
        }

        Appender!string sb;
        for (size_t i = 0; i < inString.length; i++) {
            char c = inString[i];
            if (charsToDelete.indexOf(c) == -1) {
                sb.put(c);
            }
        }
        return sb.data;
    }

}