1 /*
 * Hunt - A refined core library for the D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.Char;
13 
14 import hunt.Nullable;
15 import hunt.Exceptions;
16 import hunt.text.Common;
17 /**
18  * The {@code Character} class wraps a value of the primitive
19  * type {@code char} in an object. An object of type
20  * {@code Character} contains a single field whose type is
21  * {@code char}.
22  * <p>
23  * In addition, this class provides several methods for determining
24  * a character's category (lowercase letter, digit, etc.) and for converting
25  * characters from uppercase to lowercase and vice versa.
26  * <p>
27  * Character information is based on the Unicode Standard, version 8.0.0.
28  * <p>
29  * The methods and data of class {@code Character} are defined by
30  * the information in the <i>UnicodeData</i> file that is part of the
31  * Unicode Character Database maintained by the Unicode
32  * Consortium. This file specifies various properties including name
33  * and general category for every defined Unicode code point or
34  * character range.
35  * <p>
36  * The file and its description are available from the Unicode Consortium at:
37  * <ul>
38  * <li><a href="http://www.unicode.org">http://www.unicode.org</a>
39  * </ul>
40  *
41  * <h3><a id="unicode">Unicode Character Representations</a></h3>
42  *
43  * <p>The {@code char} data type (and therefore the value that a
44  * {@code Character} object encapsulates) are based on the
45  * original Unicode specification, which defined characters as
46  * fixed-width 16-bit entities. The Unicode Standard has since been
47  * changed to allow for characters whose representation requires more
48  * than 16 bits.  The range of legal <em>code point</em>s is now
49  * U+0000 to U+10FFFF, known as <em>Unicode scalar value</em>.
50  * (Refer to the <a
51  * href="http://www.unicode.org/reports/tr27/#notation"><i>
52  * definition</i></a> of the U+<i>n</i> notation in the Unicode
53  * Standard.)
54  *
55  * <p><a id="BMP">The set of characters from U+0000 to U+FFFF</a> is
56  * sometimes referred to as the <em>Basic Multilingual Plane (BMP)</em>.
57  * <a id="supplementary">Characters</a> whose code points are greater
58  * than U+FFFF are called <em>supplementary character</em>s.  The Java
59  * platform uses the UTF-16 representation in {@code char} arrays and
60  * in the {@code string} and {@code StringBuffer} classes. In
61  * this representation, supplementary characters are represented as a pair
62  * of {@code char} values, the first from the <em>high-surrogates</em>
63  * range, (&#92;uD800-&#92;uDBFF), the second from the
64  * <em>low-surrogates</em> range (&#92;uDC00-&#92;uDFFF).
65  *
66  * <p>A {@code char} value, therefore, represents Basic
67  * Multilingual Plane (BMP) code points, including the surrogate
68  * code points, or code units of the UTF-16 encoding. An
69  * {@code int} value represents all Unicode code points,
70  * including supplementary code points. The lower (least significant)
71  * 21 bits of {@code int} are used to represent Unicode code
72  * points and the upper (most significant) 11 bits must be zero.
73  * Unless otherwise specified, the behavior with respect to
74  * supplementary characters and surrogate {@code char} values is
75  * as follows:
76  *
77  * <ul>
78  * <li>The methods that only accept a {@code char} value cannot support
79  * supplementary characters. They treat {@code char} values from the
80  * surrogate ranges as undefined characters. For example,
81  * {@code Character.isLetter('\u005CuD840')} returns {@code false}, even though
82  * this specific value if followed by any low-surrogate value in a string
83  * would represent a letter.
84  *
85  * <li>The methods that accept an {@code int} value support all
86  * Unicode characters, including supplementary characters. For
87  * example, {@code Character.isLetter(0x2F81A)} returns
88  * {@code true} because the code point value represents a letter
89  * (a CJK ideograph).
90  * </ul>
91  *
92  * <p>In the Java SE API documentation, <em>Unicode code point</em> is
93  * used for character values in the range between U+0000 and U+10FFFF,
94  * and <em>Unicode code unit</em> is used for 16-bit
95  * {@code char} values that are code units of the <em>UTF-16</em>
96  * encoding. For more information on Unicode terminology, refer to the
97  * <a href="http://www.unicode.org/glossary/">Unicode Glossary</a>.
98  *
99  * @author  Lee Boynton
100  * @author  Guy Steele
101  * @author  Akira Tanaka
102  * @author  Martin Buchholz
103  * @author  Ulf Zibis
104  * @since   1.0
105  */
106 class Char : Nullable!char {
107     /**
108      * The minimum radix available for conversion to and from strings.
109      * The constant value of this field is the smallest value permitted
110      * for the radix argument in radix-conversion methods such as the
111      * {@code digit} method, the {@code forDigit} method, and the
112      * {@code toString} method of class {@code Integer}.
113      *
114      * @see     Character#digit(char, int)
115      * @see     Character#forDigit(int, int)
116      * @see     Integer#toString(int, int)
117      * @see     Integer#valueOf(string)
118      */
119     enum int MIN_RADIX = 2;
120 
121     /**
122      * The maximum radix available for conversion to and from strings.
123      * The constant value of this field is the largest value permitted
124      * for the radix argument in radix-conversion methods such as the
125      * {@code digit} method, the {@code forDigit} method, and the
126      * {@code toString} method of class {@code Integer}.
127      *
128      * @see     Character#digit(char, int)
129      * @see     Character#forDigit(int, int)
130      * @see     Integer#toString(int, int)
131      * @see     Integer#valueOf(string)
132      */
133     enum int MAX_RADIX = 36;
134 
135     /**
136      * The constant value of this field is the smallest value of type
137      * {@code char}, {@code '\u005Cu0000'}.
138      *
139      * @since   1.0.2
140      */
141     enum char MIN_VALUE = '\u0000';
142 
143     /**
144      * The constant value of this field is the largest value of type
145      * {@code char}, {@code '\u005CuFFFF'}.
146      *
147      * @since   1.0.2
148      */
149     // enum char MAX_VALUE = '\uFFFF';
150 
151     /**
152      * The {@code Class} instance representing the primitive type
153      * {@code char}.
154      *
155      * @since   1.1
156      */
157     // 
158     // enum Class<Character> TYPE = (Class<Character>) Class.getPrimitiveClass("char");
159 
160     /*
161      * Normative general types
162      */
163 
164     /*
165      * General character types
166      */
167 
168     /**
169      * General category "Cn" in the Unicode specification.
170      * @since   1.1
171      */
172     enum byte UNASSIGNED = 0;
173 
174     /**
175      * General category "Lu" in the Unicode specification.
176      * @since   1.1
177      */
178     enum byte UPPERCASE_LETTER = 1;
179 
180     /**
181      * General category "Ll" in the Unicode specification.
182      * @since   1.1
183      */
184     enum byte LOWERCASE_LETTER = 2;
185 
186     /**
187      * General category "Lt" in the Unicode specification.
188      * @since   1.1
189      */
190     enum byte TITLECASE_LETTER = 3;
191 
192     /**
193      * General category "Lm" in the Unicode specification.
194      * @since   1.1
195      */
196     enum byte MODIFIER_LETTER = 4;
197 
198     /**
199      * General category "Lo" in the Unicode specification.
200      * @since   1.1
201      */
202     enum byte OTHER_LETTER = 5;
203 
204     /**
205      * General category "Mn" in the Unicode specification.
206      * @since   1.1
207      */
208     enum byte NON_SPACING_MARK = 6;
209 
210     /**
211      * General category "Me" in the Unicode specification.
212      * @since   1.1
213      */
214     enum byte ENCLOSING_MARK = 7;
215 
216     /**
217      * General category "Mc" in the Unicode specification.
218      * @since   1.1
219      */
220     enum byte COMBINING_SPACING_MARK = 8;
221 
222     /**
223      * General category "Nd" in the Unicode specification.
224      * @since   1.1
225      */
226     enum byte DECIMAL_DIGIT_NUMBER = 9;
227 
228     /**
229      * General category "Nl" in the Unicode specification.
230      * @since   1.1
231      */
232     enum byte LETTER_NUMBER = 10;
233 
234     /**
235      * General category "No" in the Unicode specification.
236      * @since   1.1
237      */
238     enum byte OTHER_NUMBER = 11;
239 
240     /**
241      * General category "Zs" in the Unicode specification.
242      * @since   1.1
243      */
244     enum byte SPACE_SEPARATOR = 12;
245 
246     /**
247      * General category "Zl" in the Unicode specification.
248      * @since   1.1
249      */
250     enum byte LINE_SEPARATOR = 13;
251 
252     /**
253      * General category "Zp" in the Unicode specification.
254      * @since   1.1
255      */
256     enum byte PARAGRAPH_SEPARATOR = 14;
257 
258     /**
259      * General category "Cc" in the Unicode specification.
260      * @since   1.1
261      */
262     enum byte CONTROL = 15;
263 
264     /**
265      * General category "Cf" in the Unicode specification.
266      * @since   1.1
267      */
268     enum byte FORMAT = 16;
269 
270     /**
271      * General category "Co" in the Unicode specification.
272      * @since   1.1
273      */
274     enum byte PRIVATE_USE = 18;
275 
276     /**
277      * General category "Cs" in the Unicode specification.
278      * @since   1.1
279      */
280     enum byte SURROGATE = 19;
281 
282     /**
283      * General category "Pd" in the Unicode specification.
284      * @since   1.1
285      */
286     enum byte DASH_PUNCTUATION = 20;
287 
288     /**
289      * General category "Ps" in the Unicode specification.
290      * @since   1.1
291      */
292     enum byte START_PUNCTUATION = 21;
293 
294     /**
295      * General category "Pe" in the Unicode specification.
296      * @since   1.1
297      */
298     enum byte END_PUNCTUATION = 22;
299 
300     /**
301      * General category "Pc" in the Unicode specification.
302      * @since   1.1
303      */
304     enum byte CONNECTOR_PUNCTUATION = 23;
305 
306     /**
307      * General category "Po" in the Unicode specification.
308      * @since   1.1
309      */
310     enum byte OTHER_PUNCTUATION = 24;
311 
312     /**
313      * General category "Sm" in the Unicode specification.
314      * @since   1.1
315      */
316     enum byte MATH_SYMBOL = 25;
317 
318     /**
319      * General category "Sc" in the Unicode specification.
320      * @since   1.1
321      */
322     enum byte CURRENCY_SYMBOL = 26;
323 
324     /**
325      * General category "Sk" in the Unicode specification.
326      * @since   1.1
327      */
328     enum byte MODIFIER_SYMBOL = 27;
329 
330     /**
331      * General category "So" in the Unicode specification.
332      * @since   1.1
333      */
334     enum byte OTHER_SYMBOL = 28;
335 
336     /**
337      * General category "Pi" in the Unicode specification.
338      * @since   1.4
339      */
340     enum byte INITIAL_QUOTE_PUNCTUATION = 29;
341 
342     /**
343      * General category "Pf" in the Unicode specification.
344      * @since   1.4
345      */
346     enum byte FINAL_QUOTE_PUNCTUATION = 30;
347 
348     /**
349      * Error flag. Use int (code point) to avoid confusion with U+FFFF.
350      */
351     enum int ERROR = 0xFFFFFFFF;
352 
353     /**
354      * Undefined bidirectional character type. Undefined {@code char}
355      * values have undefined directionality in the Unicode specification.
356      * @since 1.4
357      */
358     enum byte DIRECTIONALITY_UNDEFINED = -1;
359 
360     /**
361      * Strong bidirectional character type "L" in the Unicode specification.
362      * @since 1.4
363      */
364     enum byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;
365 
366     /**
367      * Strong bidirectional character type "R" in the Unicode specification.
368      * @since 1.4
369      */
370     enum byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;
371 
372     /**
373     * Strong bidirectional character type "AL" in the Unicode specification.
374      * @since 1.4
375      */
376     enum byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;
377 
378     /**
379      * Weak bidirectional character type "EN" in the Unicode specification.
380      * @since 1.4
381      */
382     enum byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;
383 
384     /**
385      * Weak bidirectional character type "ES" in the Unicode specification.
386      * @since 1.4
387      */
388     enum byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;
389 
390     /**
391      * Weak bidirectional character type "ET" in the Unicode specification.
392      * @since 1.4
393      */
394     enum byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;
395 
396     /**
397      * Weak bidirectional character type "AN" in the Unicode specification.
398      * @since 1.4
399      */
400     enum byte DIRECTIONALITY_ARABIC_NUMBER = 6;
401 
402     /**
403      * Weak bidirectional character type "CS" in the Unicode specification.
404      * @since 1.4
405      */
406     enum byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;
407 
408     /**
409      * Weak bidirectional character type "NSM" in the Unicode specification.
410      * @since 1.4
411      */
412     enum byte DIRECTIONALITY_NONSPACING_MARK = 8;
413 
414     /**
415      * Weak bidirectional character type "BN" in the Unicode specification.
416      * @since 1.4
417      */
418     enum byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;
419 
420     /**
421      * Neutral bidirectional character type "B" in the Unicode specification.
422      * @since 1.4
423      */
424     enum byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;
425 
426     /**
427      * Neutral bidirectional character type "S" in the Unicode specification.
428      * @since 1.4
429      */
430     enum byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;
431 
432     /**
433      * Neutral bidirectional character type "WS" in the Unicode specification.
434      * @since 1.4
435      */
436     enum byte DIRECTIONALITY_WHITESPACE = 12;
437 
438     /**
439      * Neutral bidirectional character type "ON" in the Unicode specification.
440      * @since 1.4
441      */
442     enum byte DIRECTIONALITY_OTHER_NEUTRALS = 13;
443 
444     /**
445      * Strong bidirectional character type "LRE" in the Unicode specification.
446      * @since 1.4
447      */
448     enum byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;
449 
450     /**
451      * Strong bidirectional character type "LRO" in the Unicode specification.
452      * @since 1.4
453      */
454     enum byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;
455 
456     /**
457      * Strong bidirectional character type "RLE" in the Unicode specification.
458      * @since 1.4
459      */
460     enum byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;
461 
462     /**
463      * Strong bidirectional character type "RLO" in the Unicode specification.
464      * @since 1.4
465      */
466     enum byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;
467 
468     /**
469      * Weak bidirectional character type "PDF" in the Unicode specification.
470      * @since 1.4
471      */
472     enum byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
473 
474     /**
475      * The minimum value of a
476      * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
477      * Unicode high-surrogate code unit</a>
478      * in the UTF-16 encoding, constant {@code '\u005CuD800'}.
479      * A high-surrogate is also known as a <i>leading-surrogate</i>.
480      *
481      * @since 1.5
482      */
483     enum wchar MIN_HIGH_SURROGATE = 0xD800;
484 
485     /**
486      * The maximum value of a
487      * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
488      * Unicode high-surrogate code unit</a>
489      * in the UTF-16 encoding, constant {@code '\u005CuDBFF'}.
490      * A high-surrogate is also known as a <i>leading-surrogate</i>.
491      *
492      * @since 1.5
493      */
494     enum wchar MAX_HIGH_SURROGATE = 0xDBFF;
495 
496     /**
497      * The minimum value of a
498      * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
499      * Unicode low-surrogate code unit</a>
500      * in the UTF-16 encoding, constant {@code '\u005CuDC00'}.
501      * A low-surrogate is also known as a <i>trailing-surrogate</i>.
502      *
503      * @since 1.5
504      */
505     enum wchar MIN_LOW_SURROGATE  = 0xDC00;
506 
507     /**
508      * The maximum value of a
509      * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
510      * Unicode low-surrogate code unit</a>
511      * in the UTF-16 encoding, constant {@code '\u005CuDFFF'}.
512      * A low-surrogate is also known as a <i>trailing-surrogate</i>.
513      *
514      * @since 1.5
515      */
516     enum wchar MAX_LOW_SURROGATE  = 0xDFFF;
517 
518     /**
519      * The minimum value of a Unicode surrogate code unit in the
520      * UTF-16 encoding, constant {@code '\u005CuD800'}.
521      *
522      * @since 1.5
523      */
524     enum wchar MIN_SURROGATE = MIN_HIGH_SURROGATE;
525 
526     /**
527      * The maximum value of a Unicode surrogate code unit in the
528      * UTF-16 encoding, constant {@code '\u005CuDFFF'}.
529      *
530      * @since 1.5
531      */
532     enum wchar MAX_SURROGATE = MAX_LOW_SURROGATE;
533 
534     /**
535      * The maximum value of a Unicode surrogate code unit in the
536      * UTF-16 encoding, constant {@code '\u005CuDFFF'}.
537      *
538      * @since 1.5
539      */
540     // enum wchar MAX_SURROGATE = MAX_LOW_SURROGATE;
541 
542     /**
543      * The minimum value of a
544      * <a href="http://www.unicode.org/glossary/#supplementary_code_point">
545      * Unicode supplementary code point</a>, constant {@code U+10000}.
546      *
547      * @since 1.5
548      */
549     enum int MIN_SUPPLEMENTARY_CODE_POINT = 0x010000;
550 
551     /**
552      * The minimum value of a
553      * <a href="http://www.unicode.org/glossary/#code_point">
554      * Unicode code point</a>, constant {@code U+0000}.
555      *
556      * @since 1.5
557      */
558     enum int MIN_CODE_POINT = 0x000000;
559 
560     /**
561      * The maximum value of a
562      * <a href="http://www.unicode.org/glossary/#code_point">
563      * Unicode code point</a>, constant {@code U+10FFFF}.
564      *
565      * @since 1.5
566      */
567     enum int MAX_CODE_POINT = 0X10FFFF;
568 
    /**
     * Creates a wrapper for the given character by forwarding it to the
     * base-class constructor.
     *
     * @param value the character to wrap
     */
    this(char value) {
        super(value);
    }
572 
573     /**
574      * Returns a {@code Character} instance representing the specified
575      * {@code char} value.
576      * If a new {@code Character} instance is not required, this method
577      * should generally be used in preference to the constructor
578      * {@link #Character(char)}, as this method is likely to yield
579      * significantly better space and time performance by caching
580      * frequently requested values.
581      *
582      * This method will always cache values in the range {@code
583      * '\u005Cu0000'} to {@code '\u005Cu007F'}, inclusive, and may
584      * cache other values outside of this range.
585      *
586      * @param  c a char value.
587      * @return a {@code Character} instance representing {@code c}.
588      * @since  1.5
589      */
590     static Char valueOf(char c) {
591         if (c <= 127) { // must cache
592             return CharacterCache.cache[cast(int)c];
593         }
594         return new Char(c);
595     }
596 
597     /**
598      * Returns the value of this {@code Character} object.
599      * @return  the primitive {@code char} value represented by
600      *          this object.
601      */
602     char charValue() {
603         return _value;
604     }
605 
606     override size_t toHash() @trusted nothrow {
607         return _value;
608     }
609 
610     /**
611      * Determines the number of {@code char} values needed to
612      * represent the specified character (Unicode code point). If the
613      * specified character is equal to or greater than 0x10000, then
614      * the method returns 2. Otherwise, the method returns 1.
615      *
616      * <p>This method doesn't validate the specified character to be a
617      * valid Unicode code point. The caller must validate the
618      * character value using {@link #isValidCodePoint(int) isValidCodePoint}
619      * if necessary.
620      *
621      * @param   codePoint the character (Unicode code point) to be tested.
622      * @return  2 if the character is a valid supplementary character; 1 otherwise.
623      * @see     Character#isSupplementaryCodePoint(int)
624      * @since   1.5
625      */
626     static int charCount(int codePoint) {
627         return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT ? 2 : 1;
628     }
629 
630     /**
631      * Converts the specified surrogate pair to its supplementary code
632      * point value. This method does not validate the specified
633      * surrogate pair. The caller must validate it using {@link
634      * #isSurrogatePair(char, char) isSurrogatePair} if necessary.
635      *
636      * @param  high the high-surrogate code unit
637      * @param  low the low-surrogate code unit
638      * @return the supplementary code point composed from the
639      *         specified surrogate pair.
640      * @since  1.5
641      */
642     static int toCodePoint(char high, char low) {
643         // Optimized form of:
644         // return ((high - MIN_HIGH_SURROGATE) << 10)
645         //         + (low - MIN_LOW_SURROGATE)
646         //         + MIN_SUPPLEMENTARY_CODE_POINT;
647         return ((high << 10) + low) + (MIN_SUPPLEMENTARY_CODE_POINT
648                                        - (MIN_HIGH_SURROGATE << 10)
649                                        - MIN_LOW_SURROGATE);
650     }
651 
652      /**
653      * Determines if the specified character is an ISO control
654      * character.  A character is considered to be an ISO control
655      * character if its code is in the range {@code '\u005Cu0000'}
656      * through {@code '\u005Cu001F'} or in the range
657      * {@code '\u005Cu007F'} through {@code '\u005Cu009F'}.
658      *
659      * <p><b>Note:</b> This method cannot handle <a
660      * href="#supplementary"> supplementary characters</a>. To support
661      * all Unicode characters, including supplementary characters, use
662      * the {@link #isISOControl(int)} method.
663      *
664      * @param   ch      the character to be tested.
665      * @return  {@code true} if the character is an ISO control character;
666      *          {@code false} otherwise.
667      *
668      * @see     Character#isSpaceChar(char)
669      * @see     Character#isWhitespace(char)
670      * @since   1.1
671      */
672     public static bool isISOControl(char ch) {
673         return isISOControl(cast(int)ch);
674     }
675 
676     /**
677      * Determines if the referenced character (Unicode code point) is an ISO control
678      * character.  A character is considered to be an ISO control
679      * character if its code is in the range {@code '\u005Cu0000'}
680      * through {@code '\u005Cu001F'} or in the range
681      * {@code '\u005Cu007F'} through {@code '\u005Cu009F'}.
682      *
683      * @param   codePoint the character (Unicode code point) to be tested.
684      * @return  {@code true} if the character is an ISO control character;
685      *          {@code false} otherwise.
686      * @see     Character#isSpaceChar(int)
687      * @see     Character#isWhitespace(int)
688      * @since   1.5
689      */
690     public static bool isISOControl(int codePoint) {
691         // Optimized form of:
692         //     (codePoint >= 0x00 && codePoint <= 0x1F) ||
693         //     (codePoint >= 0x7F && codePoint <= 0x9F);
694         return codePoint <= 0x9F &&
695             (codePoint >= 0x7F || (codePoint >>> 5 == 0));
696     }
697 
698     /**
699      * Converts the specified character (Unicode code point) to its
700      * UTF-16 representation stored in a {@code char} array. If
701      * the specified code point is a BMP (Basic Multilingual Plane or
702      * Plane 0) value, the resulting {@code char} array has
703      * the same value as {@code codePoint}. If the specified code
704      * point is a supplementary code point, the resulting
705      * {@code char} array has the corresponding surrogate pair.
706      *
707      * @param  codePoint a Unicode code point
708      * @return a {@code char} array having
709      *         {@code codePoint}'s UTF-16 representation.
710      * @throws IllegalArgumentException if the specified
711      * {@code codePoint} is not a valid Unicode code point.
712      * @since  1.5
713      */
714     public static char[] toChars(int codePoint) {
715         if (isBmpCodePoint(codePoint)) {
716             return [ cast(char) codePoint ];
717         } else if (isValidCodePoint(codePoint)) {
718             char[] result = new char[2];
719             toSurrogates(codePoint, result, 0);
720             return result;
721         } else {
722             import std.string;
723             throw new IllegalArgumentException(
724                 format("Not a valid Unicode code point: 0x%X", codePoint));
725         }
726     }
727 
728     /**
729      * Determines whether the specified character (Unicode code point)
730      * is in the <a href="#BMP">Basic Multilingual Plane (BMP)</a>.
731      * Such code points can be represented using a single {@code char}.
732      *
733      * @param  codePoint the character (Unicode code point) to be tested
734      * @return {@code true} if the specified code point is between
735      *         {@link #MIN_VALUE} and {@link #MAX_VALUE} inclusive;
736      *         {@code false} otherwise.
737      * @since  1.7
738      */
739     public static bool isBmpCodePoint(int codePoint) {
740         return codePoint >>> 16 == 0;
741         // Optimized form of:
742         //     codePoint >= MIN_VALUE && codePoint <= MAX_VALUE
743         // We consistently use logical shift (>>>) to facilitate
744         // additional runtime optimizations.
745     }
746 
747     /**
748      * Determines whether the specified code point is a valid
749      * <a href="http://www.unicode.org/glossary/#code_point">
750      * Unicode code point value</a>.
751      *
752      * @param  codePoint the Unicode code point to be tested
753      * @return {@code true} if the specified code point value is between
754      *         {@link #MIN_CODE_POINT} and
755      *         {@link #MAX_CODE_POINT} inclusive;
756      *         {@code false} otherwise.
757      * @since  1.5
758      */
759     public static bool isValidCodePoint(int codePoint) {
760         // Optimized form of:
761         //     codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT
762         int plane = codePoint >>> 16;
763         return plane < ((MAX_CODE_POINT + 1) >>> 16);
764     }
765 
766     static void toSurrogates(int codePoint, char[] dst, int index) {
767         // We write elements "backwards" to guarantee all-or-nothing
768         dst[index+1] = lowSurrogate(codePoint);
769         dst[index] = highSurrogate(codePoint);
770     }
771 
772     /**
773      * Returns the trailing surrogate (a
774      * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
775      * low surrogate code unit</a>) of the
776      * <a href="http://www.unicode.org/glossary/#surrogate_pair">
777      * surrogate pair</a>
778      * representing the specified supplementary character (Unicode
779      * code point) in the UTF-16 encoding.  If the specified character
780      * is not a
781      * <a href="Character.html#supplementary">supplementary character</a>,
782      * an unspecified {@code char} is returned.
783      *
784      * <p>If
785      * {@link #isSupplementaryCodePoint isSupplementaryCodePoint(x)}
786      * is {@code true}, then
787      * {@link #isLowSurrogate isLowSurrogate}{@code (lowSurrogate(x))} and
788      * {@link #toCodePoint toCodePoint}{@code (}{@link #highSurrogate highSurrogate}{@code (x), lowSurrogate(x)) == x}
789      * are also always {@code true}.
790      *
791      * @param   codePoint a supplementary character (Unicode code point)
792      * @return  the trailing surrogate code unit used to represent the
793      *          character in the UTF-16 encoding
794      * @since   1.7
795      */
796     public static char lowSurrogate(int codePoint) {
797         return cast(char) ((codePoint & 0x3ff) + MIN_LOW_SURROGATE);
798     }
799 
800 
801     /**
802      * Returns the leading surrogate (a
803      * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
804      * high surrogate code unit</a>) of the
805      * <a href="http://www.unicode.org/glossary/#surrogate_pair">
806      * surrogate pair</a>
807      * representing the specified supplementary character (Unicode
808      * code point) in the UTF-16 encoding.  If the specified character
809      * is not a
810      * <a href="Character.html#supplementary">supplementary character</a>,
811      * an unspecified {@code char} is returned.
812      *
813      * <p>If
814      * {@link #isSupplementaryCodePoint isSupplementaryCodePoint(x)}
815      * is {@code true}, then
816      * {@link #isHighSurrogate isHighSurrogate}{@code (highSurrogate(x))} and
817      * {@link #toCodePoint toCodePoint}{@code (highSurrogate(x), }{@link #lowSurrogate lowSurrogate}{@code (x)) == x}
818      * are also always {@code true}.
819      *
820      * @param   codePoint a supplementary character (Unicode code point)
821      * @return  the leading surrogate code unit used to represent the
822      *          character in the UTF-16 encoding
823      * @since   1.7
824      */
825     public static char highSurrogate(int codePoint) {
826         return cast(char) ((codePoint >>> 10)
827             + (MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
828     }
829 
830     /**
831      * Returns the code point at the given index of the
832      * {@code CharSequence}. If the {@code char} value at
833      * the given index in the {@code CharSequence} is in the
834      * high-surrogate range, the following index is less than the
835      * length of the {@code CharSequence}, and the
836      * {@code char} value at the following index is in the
837      * low-surrogate range, then the supplementary code point
838      * corresponding to this surrogate pair is returned. Otherwise,
839      * the {@code char} value at the given index is returned.
840      *
841      * @param seq a sequence of {@code char} values (Unicode code
842      * units)
843      * @param index the index to the {@code char} values (Unicode
844      * code units) in {@code seq} to be converted
845      * @return the Unicode code point at the given index
846      * @throws NullPointerException if {@code seq} is null.
847      * @throws IndexOutOfBoundsException if the value
848      * {@code index} is negative or not less than
849      * {@link CharSequence#length() seq.length()}.
850      * @since  1.5
851      */
852     public static int codePointAt(string seq, int index) {
853         char c1 = seq.charAt(index);
854         if (isHighSurrogate(c1) && ++index < seq.length) {
855             char c2 = seq.charAt(index);
856             if (isLowSurrogate(c2)) {
857                 return toCodePoint(c1, c2);
858             }
859         }
860         return c1;
861     }
862 
863     /**
864      * Determines if the given {@code char} value is a
865      * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
866      * Unicode high-surrogate code unit</a>
867      * (also known as <i>leading-surrogate code unit</i>).
868      *
869      * <p>Such values do not represent characters by themselves,
870      * but are used in the representation of
871      * <a href="#supplementary">supplementary characters</a>
872      * in the UTF-16 encoding.
873      *
874      * @param  ch the {@code char} value to be tested.
875      * @return {@code true} if the {@code char} value is between
876      *         {@link #MIN_HIGH_SURROGATE} and
877      *         {@link #MAX_HIGH_SURROGATE} inclusive;
878      *         {@code false} otherwise.
879      * @see    Character#isLowSurrogate(char)
880      * @see    Character.UnicodeBlock#of(int)
881      * @since  1.5
882      */
883     public static bool isHighSurrogate(char ch) {
884         // Help VM constant-fold; MAX_HIGH_SURROGATE + 1 == MIN_LOW_SURROGATE
885         return ch >= MIN_HIGH_SURROGATE && ch < (MAX_HIGH_SURROGATE + 1);
886     }
887 
888     /**
889      * Determines if the given {@code char} value is a
890      * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
891      * Unicode low-surrogate code unit</a>
892      * (also known as <i>trailing-surrogate code unit</i>).
893      *
894      * <p>Such values do not represent characters by themselves,
895      * but are used in the representation of
896      * <a href="#supplementary">supplementary characters</a>
897      * in the UTF-16 encoding.
898      *
899      * @param  ch the {@code char} value to be tested.
900      * @return {@code true} if the {@code char} value is between
901      *         {@link #MIN_LOW_SURROGATE} and
902      *         {@link #MAX_LOW_SURROGATE} inclusive;
903      *         {@code false} otherwise.
904      * @see    Character#isHighSurrogate(char)
905      * @since  1.5
906      */
907     public static bool isLowSurrogate(char ch) {
908         return ch >= MIN_LOW_SURROGATE && ch < (MAX_LOW_SURROGATE + 1);
909     }
910 
911      /**
912      * Determines if the specified character is a letter.
913      * <p>
914      * A character is considered to be a letter if its general
915      * category type, provided by {@code Character.getType(ch)},
916      * is any of the following:
917      * <ul>
918      * <li> {@code UPPERCASE_LETTER}
919      * <li> {@code LOWERCASE_LETTER}
920      * <li> {@code TITLECASE_LETTER}
921      * <li> {@code MODIFIER_LETTER}
922      * <li> {@code OTHER_LETTER}
923      * </ul>
924      *
925      * Not all letters have case. Many characters are
926      * letters but are neither uppercase nor lowercase nor titlecase.
927      *
928      * <p><b>Note:</b> This method cannot handle <a
929      * href="#supplementary"> supplementary characters</a>. To support
930      * all Unicode characters, including supplementary characters, use
931      * the {@link #isLetter(int)} method.
932      *
933      * @param   ch   the character to be tested.
934      * @return  {@code true} if the character is a letter;
935      *          {@code false} otherwise.
936      * @see     Character#isDigit(char)
937      * @see     Character#isJavaIdentifierStart(char)
938      * @see     Character#isJavaLetter(char)
939      * @see     Character#isJavaLetterOrDigit(char)
940      * @see     Character#isLetterOrDigit(char)
941      * @see     Character#isLowerCase(char)
942      * @see     Character#isTitleCase(char)
943      * @see     Character#isUnicodeIdentifierStart(char)
944      * @see     Character#isUpperCase(char)
945      */
946     public static bool isLetter(char ch) {
947         return isLetter(cast(int)ch);
948     }
949 
950     /**
951      * Determines if the specified character (Unicode code point) is a letter.
952      * <p>
953      * A character is considered to be a letter if its general
954      * category type, provided by {@link Character#getType(int) getType(codePoint)},
955      * is any of the following:
956      * <ul>
957      * <li> {@code UPPERCASE_LETTER}
958      * <li> {@code LOWERCASE_LETTER}
959      * <li> {@code TITLECASE_LETTER}
960      * <li> {@code MODIFIER_LETTER}
961      * <li> {@code OTHER_LETTER}
962      * </ul>
963      *
964      * Not all letters have case. Many characters are
965      * letters but are neither uppercase nor lowercase nor titlecase.
966      *
967      * @param   codePoint the character (Unicode code point) to be tested.
968      * @return  {@code true} if the character is a letter;
969      *          {@code false} otherwise.
970      * @see     Character#isDigit(int)
971      * @see     Character#isJavaIdentifierStart(int)
972      * @see     Character#isLetterOrDigit(int)
973      * @see     Character#isLowerCase(int)
974      * @see     Character#isTitleCase(int)
975      * @see     Character#isUnicodeIdentifierStart(int)
976      * @see     Character#isUpperCase(int)
977      * @since   1.5
978      */
979     // public static bool isLetter(int codePoint) {
980     //     return ((((1 << Char.UPPERCASE_LETTER) |
981     //         (1 << Char.LOWERCASE_LETTER) |
982     //         (1 << Char.TITLECASE_LETTER) |
983     //         (1 << Char.MODIFIER_LETTER) |
984     //         (1 << Char.OTHER_LETTER)) >> getType(codePoint)) & 1)
985     //         != 0;
986     // }
987 }
988 
/**
 * Holder for a pre-built table of {@code Char} objects, one per character
 * value from 0 through 127. The table is filled once by the shared static
 * constructor below; the class itself is never instantiated.
 */
private class CharacterCache {
    // Private constructor: this type only exists to own the table.
    private this() {
    }

    /// Table indexed by character value; __gshared, so a single copy is
    /// shared by all threads rather than being thread-local.
    __gshared Char[] cache;

    // Shared static constructor: allocates the 128 slots and stores in each
    // one a Char wrapping the character whose value equals the slot index.
    shared static this() {
        cache = new Char[127 + 1];
        foreach (position, ref slot; cache) {
            slot = new Char(cast(char) position);
        }
    }
}