/*
 * Hunt - A refined core library for D programming language.
 *
 * Copyright (C) 2018-2019 HuntLabs
 *
 * Website: https://www.huntlabs.net/
 *
 * Licensed under the Apache-2.0 License.
 *
 */

module hunt.text.StringTokenizer;


import std.string;

import hunt.util.Common;
import hunt.Char;
import hunt.Exceptions;
import hunt.text.Common;

/**
 * The string tokenizer class allows an application to break a
 * string into tokens. The tokenizer does not distinguish among
 * identifiers, numbers, and quoted strings, nor does it recognize
 * and skip comments.
 * <p>
 * The set of delimiters (the characters that separate tokens) may
 * be specified either at creation time or on a per-token basis.
 * <p>
 * An instance of <code>StringTokenizer</code> behaves in one of two
 * ways, depending on whether it was created with the
 * <code>returnDelims</code> flag having the value <code>true</code>
 * or <code>false</code>:
 * <ul>
 * <li>If the flag is <code>false</code>, delimiter characters serve to
 *     separate tokens. A token is a maximal sequence of consecutive
 *     characters that are not delimiters.
 * <li>If the flag is <code>true</code>, delimiter characters are themselves
 *     considered to be tokens. A token is thus either one delimiter
 *     character, or a maximal sequence of consecutive characters that are
 *     not delimiters.
 * </ul><p>
 * A <code>StringTokenizer</code> object internally maintains a current
 * position within the string to be tokenized. Some operations advance this
 * current position past the characters processed.<p>
 * A token is returned by taking a substring of the string that was used to
 * create the <code>StringTokenizer</code> object.
 * <p>
 * Strings are UTF-8. Delimiter sets that contain only ASCII characters are
 * matched one code unit at a time; delimiter sets that contain non-ASCII
 * characters are matched as whole UTF-8 sequences, so a token boundary never
 * falls inside a multi-byte character of well-formed input.
 * <p>
 * The following is one example of the use of the tokenizer. The code:
 * <blockquote><pre>
 *     auto st = new StringTokenizer("this is a test");
 *     while (st.hasMoreTokens()) {
 *         writeln(st.nextToken());
 *     }
 * </pre></blockquote>
 * <p>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 * </pre></blockquote>
 * <p>
 * The class also implements <code>opApply</code>, so the same tokens can be
 * visited with <code>foreach (string token; st)</code>.
 * <p>
 * <code>StringTokenizer</code> mirrors the legacy Java class of the same
 * name and is kept for compatibility with ported code. For simple cases
 * the standard library's <code>split</code> is usually sufficient:
 * <blockquote><pre>
 *     string[] result = "this is a test".split();
 *     foreach (word; result)
 *         writeln(word);
 * </pre></blockquote>
 * <p>
 * which prints the same four lines as above.
 */
class StringTokenizer : Iterable!string {
    private int currentPosition;
    private int newPosition;
    private int maxPosition;
    private string str;
    private string delimiters;
    private bool retDelims;
    private bool delimsChanged;

    /**
     * Highest UTF-8 code unit value found in the delimiter string. In the
     * single-byte code path it lets most non-delimiter characters be
     * rejected without searching the delimiter string.
     */
    private int maxDelimCodePoint;

    /**
     * True when the delimiter string contains at least one code unit
     * outside the ASCII range, i.e. at least one delimiter is encoded as a
     * multi-byte UTF-8 sequence. The tokenizer then compares whole
     * sequences instead of single code units.
     */
    private bool hasMultiByteDelims = false;

    /**
     * When hasMultiByteDelims is true, holds one slice of the delimiter
     * string per delimiter character (its complete UTF-8 encoding).
     */
    private string[] delimiterSequences;

    /**
     * Recomputes the lookup state derived from the delimiter string:
     * maxDelimCodePoint, hasMultiByteDelims and delimiterSequences.
     * A null delimiter string resets everything to the "no delimiters" state.
     */
    private void setMaxDelimCodePoint() {
        maxDelimCodePoint = 0;
        hasMultiByteDelims = false;
        delimiterSequences = null;

        if (delimiters is null)
            return;

        int highest = 0;
        foreach (char unit; delimiters) {
            if (highest < unit)
                highest = unit;
        }
        maxDelimCodePoint = highest;

        // Pure ASCII delimiter set: the single code unit path is sufficient.
        if (highest < 0x80)
            return;

        hasMultiByteDelims = true;

        // Cut the delimiter string into characters. A new character starts
        // at every code unit that is not a UTF-8 continuation byte (10xxxxxx).
        size_t start = 0;
        for (size_t i = 1; i <= delimiters.length; i++) {
            if (i == delimiters.length || (delimiters[i] & 0xC0) != 0x80) {
                delimiterSequences ~= delimiters[start .. i];
                start = i;
            }
        }
    }

    /**
     * Constructs a string tokenizer for the specified string. All
     * characters in the <code>delim</code> argument are the delimiters
     * for separating tokens.
     * <p>
     * If the <code>returnDelims</code> flag is <code>true</code>, then
     * the delimiter characters are also returned as tokens. Each
     * delimiter is returned as a string holding that single character.
     * If the flag is <code>false</code>, the delimiter characters are
     * skipped and only serve as separators between tokens.
     * <p>
     * Note that if <code>delim</code> is <code>null</code>, this constructor
     * does not throw an exception. However, invoking other methods on the
     * resulting <code>StringTokenizer</code> may result in a
     * <code>NullPointerException</code>.
     * <p>
     * The constructor itself only reads <code>str.length</code>; a
     * <code>null</code> string has length zero and therefore yields a
     * tokenizer without tokens.
     *
     * @param   str            a string to be parsed.
     * @param   delim          the delimiters.
     * @param   returnDelims   flag indicating whether to return the delimiters
     *                         as tokens.
     */
    this(string str, string delim, bool returnDelims) {
        currentPosition = 0;
        newPosition = -1;
        delimsChanged = false;
        this.str = str;
        maxPosition = cast(int) str.length;
        delimiters = delim;
        retDelims = returnDelims;
        setMaxDelimCodePoint();
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * characters in the <code>delim</code> argument are the delimiters
     * for separating tokens. Delimiter characters themselves will not
     * be treated as tokens.
     * <p>
     * Note that if <code>delim</code> is <code>null</code>, this constructor
     * does not throw an exception. However, invoking other methods on the
     * resulting <code>StringTokenizer</code> may result in a
     * <code>NullPointerException</code>.
     *
     * @param   str     a string to be parsed.
     * @param   delim   the delimiters.
     */
    this(string str, string delim) {
        this(str, delim, false);
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * tokenizer uses the default delimiter set, which is
     * <code>" \t\n\r\f"</code>: the space character,
     * the tab character, the newline character, the carriage-return character,
     * and the form-feed character. Delimiter characters themselves will
     * not be treated as tokens.
     *
     * @param   str   a string to be parsed.
     */
    this(string str) {
        this(str, " \t\n\r\f", false);
    }

    /**
     * Reports whether a delimiter starts at the given index of the input.
     *
     * @param   position   index into the input string.
     * @return  the number of code units occupied by the delimiter that
     *          starts at <code>position</code>, or 0 if none starts there.
     */
    private int delimiterLengthAt(int position) {
        if (!hasMultiByteDelims) {
            char c = str[position];
            return ((c <= maxDelimCodePoint) && (delimiters.indexOf(c) >= 0)) ? 1 : 0;
        }

        // Compare complete UTF-8 sequences. A sequence begins with a lead
        // byte, so in well-formed input it cannot match in the middle of
        // another character.
        string rest = str[position .. maxPosition];
        foreach (seq; delimiterSequences) {
            if (rest.length >= seq.length && rest[0 .. seq.length] == seq)
                return cast(int) seq.length;
        }
        return 0;
    }

    /**
     * Skips delimiters starting from the specified position. If retDelims
     * is false, returns the index of the first non-delimiter character at or
     * after startPos. If retDelims is true, startPos is returned.
     *
     * @exception  NullPointerException   if the delimiter string is null.
     */
    private int skipDelimiters(int startPos) {
        if (delimiters is null)
            throw new NullPointerException();

        // Delimiters are tokens themselves in this mode: nothing to skip.
        if (retDelims)
            return startPos;

        int position = startPos;
        while (position < maxPosition) {
            immutable int len = delimiterLengthAt(position);
            if (len == 0)
                break;
            position += len;
        }
        return position;
    }

    /**
     * Skips ahead from startPos and returns the index of the next delimiter
     * character encountered, or maxPosition if no such delimiter is found.
     * If retDelims is true and a delimiter starts exactly at startPos, the
     * index just past that delimiter is returned instead, so the delimiter
     * becomes a token of its own.
     */
    private int scanToken(int startPos) {
        int position = startPos;
        while (position < maxPosition && delimiterLengthAt(position) == 0)
            position++;

        if (retDelims && (startPos == position) && position < maxPosition)
            position += delimiterLengthAt(position);

        return position;
    }

    /**
     * Tests if there are more tokens available from this tokenizer's string.
     * If this method returns <code>true</code>, then a subsequent call to
     * <code>nextToken</code> with no argument will successfully return a token.
     *
     * @return  <code>true</code> if and only if there is at least one token
     *          in the string after the current position; <code>false</code>
     *          otherwise.
     * @exception  NullPointerException   if the delimiter string is null.
     */
    bool hasMoreTokens() {
        /*
         * Temporarily store this position and use it in the following
         * nextToken() method only if the delimiters haven't been changed in
         * that nextToken() invocation.
         */
        newPosition = skipDelimiters(currentPosition);
        return (newPosition < maxPosition);
    }

    /**
     * Returns the next token from this string tokenizer.
     *
     * @return     the next token from this string tokenizer.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @exception  NullPointerException    if the delimiter string is null.
     */
    string nextToken() {
        /*
         * If the next position was already computed in hasMoreTokens() and
         * the delimiters have NOT changed between that computation and this
         * invocation, reuse the computed value; otherwise skip delimiters
         * again with the current delimiter set.
         */
        currentPosition = (newPosition >= 0 && !delimsChanged) ?
            newPosition : skipDelimiters(currentPosition);

        /* Reset these anyway */
        delimsChanged = false;
        newPosition = -1;

        if (currentPosition >= maxPosition)
            throw new NoSuchElementException("");

        int start = currentPosition;
        currentPosition = scanToken(currentPosition);
        return str.substring(start, currentPosition);
    }

    /**
     * Returns the next token in this string tokenizer's string. First,
     * the set of characters considered to be delimiters by this
     * <code>StringTokenizer</code> object is changed to be the characters in
     * the string <code>delim</code>. Then the next token in the string
     * after the current position is returned. The current position is
     * advanced beyond the recognized token. The new delimiter set
     * remains the default after this call.
     *
     * @param      delim   the new delimiters.
     * @return     the next token, after switching to the new delimiter set.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @exception  NullPointerException    if delim is <code>null</code>
     */
    string nextToken(string delim) {
        delimiters = delim;

        /* delimiter string specified, so set the appropriate flag. */
        delimsChanged = true;

        setMaxDelimCodePoint();
        return nextToken();
    }

    /**
     * Supports <code>foreach</code> over the remaining tokens. Each token is
     * consumed as it is handed to the loop body; iteration stops early when
     * the body returns a non-zero value (e.g. on <code>break</code>).
     */
    int opApply(scope int delegate(ref string) dg)
    {
        int result = 0;
        while (hasMoreTokens && result == 0) {
            string s = nextToken();
            result = dg(s);
        }
        return result;
    }

    /**
     * Calculates the number of times that this tokenizer's
     * <code>nextToken</code> method can be called before it generates an
     * exception. The current position is not advanced.
     *
     * @return  the number of tokens remaining in the string using the current
     *          delimiter set.
     * @exception  NullPointerException   if the delimiter string is null and
     *               unread input remains.
     */
    int countTokens() {
        int count = 0;
        int currpos = currentPosition;
        while (currpos < maxPosition) {
            currpos = skipDelimiters(currpos);
            if (currpos >= maxPosition)
                break;
            currpos = scanToken(currpos);
            count++;
        }
        return count;
    }
}