1 /*
2  * Hunt - A refined core library for D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.text.StringTokenizer;
13 
14 
15 import std.string;
16 
17 import hunt.util.Common;
18 import hunt.Char;
19 import hunt.Exceptions;
20 import hunt.text.Common;
21 
22 /**
23  * The string tokenizer class allows an application to break a
24  * string into tokens. The tokenization method is much simpler than
25  * the one used by the <code>StreamTokenizer</code> class. The
26  * <code>StringTokenizer</code> methods do not distinguish among
27  * identifiers, numbers, and quoted strings, nor do they recognize
28  * and skip comments.
29  * <p>
30  * The set of delimiters (the characters that separate tokens) may
31  * be specified either at creation time or on a per-token basis.
32  * <p>
33  * An instance of <code>StringTokenizer</code> behaves in one of two
34  * ways, depending on whether it was created with the
35  * <code>returnDelims</code> flag having the value <code>true</code>
36  * or <code>false</code>:
37  * <ul>
38  * <li>If the flag is <code>false</code>, delimiter characters serve to
39  *     separate tokens. A token is a maximal sequence of consecutive
40  *     characters that are not delimiters.
41  * <li>If the flag is <code>true</code>, delimiter characters are themselves
42  *     considered to be tokens. A token is thus either one delimiter
43  *     character, or a maximal sequence of consecutive characters that are
44  *     not delimiters.
45  * </ul><p>
46  * A <tt>StringTokenizer</tt> object internally maintains a current
47  * position within the string to be tokenized. Some operations advance this
48  * current position past the characters processed.<p>
49  * A token is returned by taking a substring of the string that was used to
50  * create the <tt>StringTokenizer</tt> object.
51  * <p>
52  * The following is one example of the use of the tokenizer. The code:
53  * <blockquote><pre>
 *     auto st = new StringTokenizer("this is a test");
 *     while (st.hasMoreTokens()) {
 *         writeln(st.nextToken());
 *     }
58  * </pre></blockquote>
59  * <p>
60  * prints the following output:
61  * <blockquote><pre>
62  *     this
63  *     is
64  *     a
65  *     test
66  * </pre></blockquote>
67  *
68  * <p>
 * <tt>StringTokenizer</tt> is a legacy class that is retained for
 * compatibility reasons although its use is discouraged in new code. It is
 * recommended that anyone seeking this functionality use the <tt>split</tt>
 * function of <tt>std.array</tt> or the <tt>std.regex</tt> module instead.
 * <p>
 * The following example illustrates how <tt>split</tt> can be used to
 * break up a string into its basic tokens:
 * <blockquote><pre>
 *     string[] result = "this is a test".split();
 *     foreach (token; result)
 *         writeln(token);
80  * </pre></blockquote>
81  * <p>
82  * prints the following output:
83  * <blockquote><pre>
84  *     this
85  *     is
86  *     a
87  *     test
88  * </pre></blockquote>
89  *
90  * @author  unascribed
91  * @see     java.io.StreamTokenizer
92  * @since   JDK1.0
93  */
94 
class StringTokenizer : Iterable!string {
    /** Index (in UTF-8 code units of <code>str</code>) where the next scan starts. */
    private int currentPosition;

    /**
     * Start of the next token as pre-computed by <code>hasMoreTokens</code>,
     * or -1 when no position is cached.
     */
    private int newPosition;

    /** Length of <code>str</code> in code units; scanning never goes past it. */
    private int maxPosition;

    /** The text being tokenized. Tokens are returned as slices of it. */
    private string str;

    /** The current delimiter set; every code point in it is a delimiter. */
    private string delimiters;

    /** When true, each delimiter is itself returned as a token. */
    private bool retDelims;

    /** Set by <code>nextToken(string)</code> so a stale cached position is discarded. */
    private bool delimsChanged;

    /**
     * The delimiter code point with the highest value. Any code point above
     * it cannot be a delimiter, which lets the scanners skip the search of
     * the delimiter set for such characters.
     */
    private dchar maxDelimCodePoint;

    /**
     * Decodes the UTF-8 code point that starts at <code>index</code> and
     * advances <code>index</code> past it.
     * <p>
     * An invalid sequence is decoded as U+FFFD instead of raising an
     * exception, so malformed input never aborts tokenization.
     *
     * @param   text    the text to decode from.
     * @param   index   code-unit offset of the character; must be less than
     *                  <code>text.length</code>. Updated to the offset of the
     *                  following character.
     * @return  the decoded code point.
     */
    private static dchar decodeAt(string text, ref size_t index) {
        import std.utf : decode, UseReplacementDchar;

        return decode!(UseReplacementDchar.yes)(text, index);
    }

    /**
     * Set maxDelimCodePoint to the highest code point in the delimiter set.
     * A null or empty delimiter set yields 0.
     */
    private void setMaxDelimCodePoint() {
        dchar highest = 0;
        size_t index = 0;
        // Walk whole code points, not code units, so that multi-byte
        // delimiters are compared by their real value.
        while (index < delimiters.length) {
            immutable dchar codePoint = decodeAt(delimiters, index);
            if (codePoint > highest)
                highest = codePoint;
        }
        maxDelimCodePoint = highest;
    }

    /**
     * Tells whether the given code point belongs to the current delimiter set.
     *
     * @param   codePoint   the decoded character to test.
     * @return  <code>true</code> if it is a delimiter.
     */
    private bool isDelimiter(dchar codePoint) {
        // The range check is a cheap pre-filter before searching the set.
        return codePoint <= maxDelimCodePoint && delimiters.indexOf(codePoint) >= 0;
    }

    /**
     * Constructs a string tokenizer for the specified string. All
     * characters in the <code>delim</code> argument are the delimiters
     * for separating tokens.
     * <p>
     * If the <code>returnDelims</code> flag is <code>true</code>, then
     * the delimiter characters are also returned as tokens. Each
     * delimiter is returned as a string holding that one character. If the
     * flag is <code>false</code>, the delimiter characters are skipped and
     * only serve as separators between tokens.
     * <p>
     * Note that if <tt>delim</tt> is <tt>null</tt>, this constructor does
     * not throw an exception. However, trying to invoke other methods on the
     * resulting <tt>StringTokenizer</tt> may result in a
     * <tt>NullPointerException</tt>.
     * <p>
     * A <tt>null</tt> <code>str</code> has length 0 and is therefore
     * treated like an empty string.
     *
     * @param   str            a string to be parsed.
     * @param   delim          the delimiters.
     * @param   returnDelims   flag indicating whether to return the delimiters
     *                         as tokens.
     */
    this(string str, string delim, bool returnDelims) {
        currentPosition = 0;
        newPosition = -1;
        delimsChanged = false;
        this.str = str;
        maxPosition = cast(int) str.length;
        delimiters = delim;
        retDelims = returnDelims;
        setMaxDelimCodePoint();
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * characters in the <code>delim</code> argument are the delimiters
     * for separating tokens. Delimiter characters themselves will not
     * be treated as tokens.
     * <p>
     * Note that if <tt>delim</tt> is <tt>null</tt>, this constructor does
     * not throw an exception. However, trying to invoke other methods on the
     * resulting <tt>StringTokenizer</tt> may result in a
     * <tt>NullPointerException</tt>.
     *
     * @param   str     a string to be parsed.
     * @param   delim   the delimiters.
     */
    this(string str, string delim) {
        this(str, delim, false);
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * tokenizer uses the default delimiter set, which is
     * <code>"&nbsp;&#92;t&#92;n&#92;r&#92;f"</code>: the space character,
     * the tab character, the newline character, the carriage-return character,
     * and the form-feed character. Delimiter characters themselves will
     * not be treated as tokens.
     *
     * @param   str   a string to be parsed.
     */
    this(string str) {
        this(str, " \t\n\r\f", false);
    }

    /**
     * Skips delimiters starting from the specified position. If retDelims
     * is false, returns the index of the first non-delimiter character at or
     * after startPos. If retDelims is true, startPos is returned.
     *
     * @exception NullPointerException if the delimiter set is <CODE>null</CODE>
     */
    private int skipDelimiters(int startPos) {
        if (delimiters is null)
            throw new NullPointerException();

        int position = startPos;
        while (!retDelims && position < maxPosition) {
            size_t next = position;
            if (!isDelimiter(decodeAt(str, next)))
                break;
            // Step over the whole (possibly multi-byte) delimiter.
            position = cast(int) next;
        }
        return position;
    }

    /**
     * Skips ahead from startPos and returns the index of the next delimiter
     * character encountered, or maxPosition if no such delimiter is found.
     * When retDelims is true and the character at startPos is itself a
     * delimiter, the index just past that delimiter is returned instead, so
     * that the delimiter forms a token of its own.
     */
    private int scanToken(int startPos) {
        int position = startPos;

        while (position < maxPosition) {
            size_t next = position;
            if (isDelimiter(decodeAt(str, next))) {
                // A delimiter found right at the start is the token itself
                // when delimiters are being returned.
                if (retDelims && position == startPos)
                    position = cast(int) next;
                break;
            }
            position = cast(int) next;
        }
        return position;
    }

    /**
     * Tests if there are more tokens available from this tokenizer's string.
     * If this method returns <tt>true</tt>, then a subsequent call to
     * <tt>nextToken</tt> with no argument will successfully return a token.
     *
     * @return  <code>true</code> if and only if there is at least one token
     *          in the string after the current position; <code>false</code>
     *          otherwise.
     * @exception NullPointerException if the delimiter set is <CODE>null</CODE>
     */
    bool hasMoreTokens() {
        /*
         * Temporarily store this position and use it in the following
         * nextToken() method only if the delimiters haven't been changed in
         * that nextToken() invocation.
         */
        newPosition = skipDelimiters(currentPosition);
        return (newPosition < maxPosition);
    }

    /**
     * Returns the next token from this string tokenizer.
     *
     * @return     the next token from this string tokenizer.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @exception  NullPointerException if the delimiter set is <CODE>null</CODE>
     */
    string nextToken() {
        /*
         * If the next position was already computed by hasMoreTokens() and
         * the delimiters have NOT changed since that computation, reuse the
         * computed value; otherwise skip delimiters again from the current
         * position.
         */
        currentPosition = (newPosition >= 0 && !delimsChanged) ?
            newPosition : skipDelimiters(currentPosition);

        /* Reset these anyway */
        delimsChanged = false;
        newPosition = -1;

        if (currentPosition >= maxPosition)
            throw new NoSuchElementException("");
        int start = currentPosition;
        currentPosition = scanToken(currentPosition);
        return str.substring(start, currentPosition);
    }

    /**
     * Returns the next token in this string tokenizer's string. First,
     * the set of characters considered to be delimiters by this
     * <tt>StringTokenizer</tt> object is changed to be the characters in
     * the string <tt>delim</tt>. Then the next token in the string
     * after the current position is returned. The current position is
     * advanced beyond the recognized token.  The new delimiter set
     * remains the default after this call.
     *
     * @param      delim   the new delimiters.
     * @return     the next token, after switching to the new delimiter set.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @exception NullPointerException if delim is <CODE>null</CODE>
     */
    string nextToken(string delim) {
        delimiters = delim;

        /* delimiter string specified, so set the appropriate flag. */
        delimsChanged = true;

        setMaxDelimCodePoint();
        return nextToken();
    }

    /**
     * Supports <code>foreach</code> over the remaining tokens. Each token is
     * consumed with <code>nextToken</code>, so iteration advances the
     * tokenizer. Iteration stops early when the delegate returns non-zero.
     *
     * @param   dg   the loop body supplied by the compiler.
     * @return  0 when all tokens were visited, otherwise the non-zero value
     *          returned by <code>dg</code>.
     */
    int opApply(scope int delegate(ref string) dg)
    {
        int result = 0;
        while (hasMoreTokens() && result == 0) {
            string token = nextToken();
            result = dg(token);
        }
        return result;
    }

    /**
     * Calculates the number of times that this tokenizer's
     * <code>nextToken</code> method can be called before it generates an
     * exception. The current position is not advanced.
     *
     * @return  the number of tokens remaining in the string using the current
     *          delimiter set.
     * @exception NullPointerException if the delimiter set is <CODE>null</CODE>
     *            and unscanned input remains
     */
    int countTokens() {
        int count = 0;
        int currpos = currentPosition;
        while (currpos < maxPosition) {
            currpos = skipDelimiters(currpos);
            if (currpos >= maxPosition)
                break;
            currpos = scanToken(currpos);
            count++;
        }
        return count;
    }
}