/*
 * Hunt - A refined core library for D programming language.
 *
 * Copyright (C) 2018-2019 HuntLabs
 *
 * Website: https://www.huntlabs.net/
 *
 * Licensed under the Apache-2.0 License.
 *
 */

module hunt.Char;

import hunt.Nullable;
import hunt.Exceptions;
import hunt.text.Common;

/**
 * The {@code Char} class wraps a value of the primitive type {@code char}
 * in an object. It is a port of Java's {@code Character} class, and most of
 * the documentation below is inherited from that class.
 * <p>
 * In addition, this class provides several methods for determining
 * a character's category and for converting between code points and
 * UTF-16 code units.
 * <p>
 * Character information is based on the Unicode Standard, version 8.0.0.
 * <p>
 * The methods and data of class {@code Character} are defined by
 * the information in the <i>UnicodeData</i> file that is part of the
 * Unicode Character Database maintained by the Unicode
 * Consortium. This file specifies various properties including name
 * and general category for every defined Unicode code point or
 * character range.
 * <p>
 * The file and its description are available from the Unicode Consortium at:
 * <ul>
 * <li><a href="http://www.unicode.org">http://www.unicode.org</a>
 * </ul>
 *
 * <h3><a id="unicode">Unicode Character Representations</a></h3>
 *
 * <p>In Java, the {@code char} data type (and therefore the value that a
 * {@code Character} object encapsulates) is based on the
 * original Unicode specification, which defined characters as
 * fixed-width 16-bit entities. The Unicode Standard has since been
 * changed to allow for characters whose representation requires more
 * than 16 bits. The range of legal <em>code point</em>s is now
 * U+0000 to U+10FFFF, known as <em>Unicode scalar value</em>.
 * (Refer to the <a
 * href="http://www.unicode.org/reports/tr27/#notation"><i>
 * definition</i></a> of the U+<i>n</i> notation in the Unicode
 * Standard.)
 *
 * <p><a id="BMP">The set of characters from U+0000 to U+FFFF</a> is
 * sometimes referred to as the <em>Basic Multilingual Plane (BMP)</em>.
 * <a id="supplementary">Characters</a> whose code points are greater
 * than U+FFFF are called <em>supplementary character</em>s. The Java
 * platform uses the UTF-16 representation in {@code char} arrays and
 * in the {@code string} and {@code StringBuffer} classes. In
 * this representation, supplementary characters are represented as a pair
 * of {@code char} values, the first from the <em>high-surrogates</em>
 * range, (\uD800-\uDBFF), the second from the
 * <em>low-surrogates</em> range (\uDC00-\uDFFF).
 *
 * <p>A Java {@code char} value, therefore, represents Basic
 * Multilingual Plane (BMP) code points, including the surrogate
 * code points, or code units of the UTF-16 encoding. An
 * {@code int} value represents all Unicode code points,
 * including supplementary code points. The lower (least significant)
 * 21 bits of {@code int} are used to represent Unicode code
 * points and the upper (most significant) 11 bits must be zero.
 * Unless otherwise specified, the behavior with respect to
 * supplementary characters and surrogate {@code char} values is
 * as follows:
 *
 * <ul>
 * <li>The methods that only accept a {@code char} value cannot support
 * supplementary characters. They treat {@code char} values from the
 * surrogate ranges as undefined characters. For example,
 * {@code Character.isLetter('\uD840')} returns {@code false}, even though
 * this specific value if followed by any low-surrogate value in a string
 * would represent a letter.
 *
 * <li>The methods that accept an {@code int} value support all
 * Unicode characters, including supplementary characters. For
 * example, {@code Character.isLetter(0x2F81A)} returns
 * {@code true} because the code point value represents a letter
 * (a CJK ideograph).
 * </ul>
 *
 * <p>In the Java SE API documentation, <em>Unicode code point</em> is
 * used for character values in the range between U+0000 and U+10FFFF,
 * and <em>Unicode code unit</em> is used for 16-bit
 * {@code char} values that are code units of the <em>UTF-16</em>
 * encoding. For more information on Unicode terminology, refer to the
 * <a href="http://www.unicode.org/glossary/">Unicode Glossary</a>.
 *
 * <p><b>NOTE(review), D port:</b> unlike Java, D's {@code char} is an
 * 8-bit UTF-8 code unit (16-bit code units are {@code wchar}). The
 * surrogate constants below are therefore declared as {@code wchar}, but
 * the methods still take and return {@code char}. As a consequence
 * {@code isHighSurrogate(char)} and {@code isLowSurrogate(char)} can never
 * return {@code true}, and {@code toChars}, {@code lowSurrogate} and
 * {@code highSurrogate} truncate their results with {@code cast(char)}.
 * The signatures are kept as they are so existing callers still compile;
 * widening them to {@code wchar} should be considered separately.
 *
 * @author Lee Boynton
 * @author Guy Steele
 * @author Akira Tanaka
 * @author Martin Buchholz
 * @author Ulf Zibis
 */
class Char : Nullable!char {
    /**
     * The minimum radix available for conversion to and from strings.
     * The constant value of this field is the smallest value permitted
     * for the radix argument in radix-conversion methods.
     */
    enum int MIN_RADIX = 2;

    /**
     * The maximum radix available for conversion to and from strings.
     * The constant value of this field is the largest value permitted
     * for the radix argument in radix-conversion methods.
     */
    enum int MAX_RADIX = 36;

    /**
     * The constant value of this field is the smallest value of type
     * {@code char}, {@code '\u0000'}.
     */
    enum char MIN_VALUE = '\u0000';

    /**
     * In Java, the largest value of type {@code char}, {@code '\uFFFF'}.
     * Left disabled in this port; U+FFFF does not fit into D's 8-bit
     * {@code char}.
     */
    // enum char MAX_VALUE = '\uFFFF';

    /**
     * In Java, the {@code Class} instance representing the primitive type
     * {@code char}. Not ported.
     */
    //
    // enum Class<Character> TYPE = (Class<Character>) Class.getPrimitiveClass("char");

    /*
     * Normative general types
     */

    /*
     * General character types
     */

    /** General category "Cn" in the Unicode specification. */
    enum byte UNASSIGNED = 0;

    /** General category "Lu" in the Unicode specification. */
    enum byte UPPERCASE_LETTER = 1;

    /** General category "Ll" in the Unicode specification. */
    enum byte LOWERCASE_LETTER = 2;

    /** General category "Lt" in the Unicode specification. */
    enum byte TITLECASE_LETTER = 3;

    /** General category "Lm" in the Unicode specification. */
    enum byte MODIFIER_LETTER = 4;

    /** General category "Lo" in the Unicode specification. */
    enum byte OTHER_LETTER = 5;

    /** General category "Mn" in the Unicode specification. */
    enum byte NON_SPACING_MARK = 6;

    /** General category "Me" in the Unicode specification. */
    enum byte ENCLOSING_MARK = 7;

    /** General category "Mc" in the Unicode specification. */
    enum byte COMBINING_SPACING_MARK = 8;

    /** General category "Nd" in the Unicode specification. */
    enum byte DECIMAL_DIGIT_NUMBER = 9;

    /** General category "Nl" in the Unicode specification. */
    enum byte LETTER_NUMBER = 10;

    /** General category "No" in the Unicode specification. */
    enum byte OTHER_NUMBER = 11;

    /** General category "Zs" in the Unicode specification. */
    enum byte SPACE_SEPARATOR = 12;

    /** General category "Zl" in the Unicode specification. */
    enum byte LINE_SEPARATOR = 13;

    /** General category "Zp" in the Unicode specification. */
    enum byte PARAGRAPH_SEPARATOR = 14;

    /** General category "Cc" in the Unicode specification. */
    enum byte CONTROL = 15;

    /** General category "Cf" in the Unicode specification. */
    enum byte FORMAT = 16;

    // The value 17 is intentionally not assigned to any category here.

    /** General category "Co" in the Unicode specification. */
    enum byte PRIVATE_USE = 18;

    /** General category "Cs" in the Unicode specification. */
    enum byte SURROGATE = 19;

    /** General category "Pd" in the Unicode specification. */
    enum byte DASH_PUNCTUATION = 20;

    /** General category "Ps" in the Unicode specification. */
    enum byte START_PUNCTUATION = 21;

    /** General category "Pe" in the Unicode specification. */
    enum byte END_PUNCTUATION = 22;

    /** General category "Pc" in the Unicode specification. */
    enum byte CONNECTOR_PUNCTUATION = 23;

    /** General category "Po" in the Unicode specification. */
    enum byte OTHER_PUNCTUATION = 24;

    /** General category "Sm" in the Unicode specification. */
    enum byte MATH_SYMBOL = 25;

    /** General category "Sc" in the Unicode specification. */
    enum byte CURRENCY_SYMBOL = 26;

    /** General category "Sk" in the Unicode specification. */
    enum byte MODIFIER_SYMBOL = 27;

    /** General category "So" in the Unicode specification. */
    enum byte OTHER_SYMBOL = 28;

    /** General category "Pi" in the Unicode specification. */
    enum byte INITIAL_QUOTE_PUNCTUATION = 29;

    /** General category "Pf" in the Unicode specification. */
    enum byte FINAL_QUOTE_PUNCTUATION = 30;

    /**
     * Error flag. Use int (code point) to avoid confusion with U+FFFF.
     */
    enum int ERROR = 0xFFFFFFFF;

    /**
     * Undefined bidirectional character type. Undefined {@code char}
     * values have undefined directionality in the Unicode specification.
     */
    enum byte DIRECTIONALITY_UNDEFINED = -1;

    /** Strong bidirectional character type "L" in the Unicode specification. */
    enum byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;

    /** Strong bidirectional character type "R" in the Unicode specification. */
    enum byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;

    /** Strong bidirectional character type "AL" in the Unicode specification. */
    enum byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;

    /** Weak bidirectional character type "EN" in the Unicode specification. */
    enum byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;

    /** Weak bidirectional character type "ES" in the Unicode specification. */
    enum byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;

    /** Weak bidirectional character type "ET" in the Unicode specification. */
    enum byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;

    /** Weak bidirectional character type "AN" in the Unicode specification. */
    enum byte DIRECTIONALITY_ARABIC_NUMBER = 6;

    /** Weak bidirectional character type "CS" in the Unicode specification. */
    enum byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;

    /** Weak bidirectional character type "NSM" in the Unicode specification. */
    enum byte DIRECTIONALITY_NONSPACING_MARK = 8;

    /** Weak bidirectional character type "BN" in the Unicode specification. */
    enum byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;

    /** Neutral bidirectional character type "B" in the Unicode specification. */
    enum byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;

    /** Neutral bidirectional character type "S" in the Unicode specification. */
    enum byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;

    /** Neutral bidirectional character type "WS" in the Unicode specification. */
    enum byte DIRECTIONALITY_WHITESPACE = 12;

    /** Neutral bidirectional character type "ON" in the Unicode specification. */
    enum byte DIRECTIONALITY_OTHER_NEUTRALS = 13;

    /** Strong bidirectional character type "LRE" in the Unicode specification. */
    enum byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;

    /** Strong bidirectional character type "LRO" in the Unicode specification. */
    enum byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;

    /** Strong bidirectional character type "RLE" in the Unicode specification. */
    enum byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;

    /** Strong bidirectional character type "RLO" in the Unicode specification. */
    enum byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;

    /** Weak bidirectional character type "PDF" in the Unicode specification. */
    enum byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;

    /**
     * The minimum value of a
     * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
     * Unicode high-surrogate code unit</a>
     * in the UTF-16 encoding, constant {@code '\uD800'}.
     * A high-surrogate is also known as a <i>leading-surrogate</i>.
     */
    enum wchar MIN_HIGH_SURROGATE = 0xD800;

    /**
     * The maximum value of a
     * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
     * Unicode high-surrogate code unit</a>
     * in the UTF-16 encoding, constant {@code '\uDBFF'}.
     * A high-surrogate is also known as a <i>leading-surrogate</i>.
     */
    enum wchar MAX_HIGH_SURROGATE = 0xDBFF;

    /**
     * The minimum value of a
     * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
     * Unicode low-surrogate code unit</a>
     * in the UTF-16 encoding, constant {@code '\uDC00'}.
     * A low-surrogate is also known as a <i>trailing-surrogate</i>.
     */
    enum wchar MIN_LOW_SURROGATE = 0xDC00;

    /**
     * The maximum value of a
     * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
     * Unicode low-surrogate code unit</a>
     * in the UTF-16 encoding, constant {@code '\uDFFF'}.
     * A low-surrogate is also known as a <i>trailing-surrogate</i>.
     */
    enum wchar MAX_LOW_SURROGATE = 0xDFFF;

    /**
     * The minimum value of a Unicode surrogate code unit in the
     * UTF-16 encoding, constant {@code '\uD800'}.
     */
    enum wchar MIN_SURROGATE = MIN_HIGH_SURROGATE;

    /**
     * The maximum value of a Unicode surrogate code unit in the
     * UTF-16 encoding, constant {@code '\uDFFF'}.
     */
    enum wchar MAX_SURROGATE = MAX_LOW_SURROGATE;

    /**
     * The minimum value of a
     * <a href="http://www.unicode.org/glossary/#supplementary_code_point">
     * Unicode supplementary code point</a>, constant {@code U+10000}.
     */
    enum int MIN_SUPPLEMENTARY_CODE_POINT = 0x010000;

    /**
     * The minimum value of a
     * <a href="http://www.unicode.org/glossary/#code_point">
     * Unicode code point</a>, constant {@code U+0000}.
     */
    enum int MIN_CODE_POINT = 0x000000;

    /**
     * The maximum value of a
     * <a href="http://www.unicode.org/glossary/#code_point">
     * Unicode code point</a>, constant {@code U+10FFFF}.
     */
    enum int MAX_CODE_POINT = 0X10FFFF;

    /**
     * Constructs a {@code Char} wrapping the given value by forwarding it
     * to the base class constructor.
     *
     * @param value the {@code char} to wrap.
     */
    this(char value) {
        super(value);
    }

    /**
     * Returns a {@code Char} instance representing the specified
     * {@code char} value.
     *
     * Values in the range {@code '\u0000'} to {@code '\u007F'}, inclusive,
     * are served from {@code CharacterCache}; every other value gets a
     * newly allocated instance.
     *
     * @param  c a char value.
     * @return a {@code Char} instance representing {@code c}.
     */
    static Char valueOf(char c) {
        if (c <= 127) { // must cache
            return CharacterCache.cache[cast(int)c];
        }
        return new Char(c);
    }

    /**
     * Returns the value of this {@code Char} object.
     * @return  the primitive {@code char} value represented by
     *          this object.
     */
    char charValue() {
        return _value;
    }

    /**
     * Returns the wrapped value itself as the hash code.
     */
    override size_t toHash() @trusted nothrow {
        return _value;
    }

    /**
     * Determines the number of UTF-16 code units needed to
     * represent the specified character (Unicode code point). If the
     * specified character is equal to or greater than 0x10000, then
     * the method returns 2. Otherwise, the method returns 1.
     *
     * <p>This method doesn't validate the specified character to be a
     * valid Unicode code point. The caller must validate the
     * character value using {@link #isValidCodePoint(int) isValidCodePoint}
     * if necessary.
     *
     * @param   codePoint the character (Unicode code point) to be tested.
     * @return  2 if the character is a valid supplementary character; 1 otherwise.
     */
    static int charCount(int codePoint) {
        return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT ? 2 : 1;
    }

    /**
     * Converts the specified surrogate pair to its supplementary code
     * point value. This method does not validate the specified
     * surrogate pair.
     *
     * NOTE(review): both parameters are 8-bit {@code char} values, so no
     * real surrogate code unit (0xD800-0xDFFF) can be passed in.
     *
     * @param  high the high-surrogate code unit
     * @param  low the low-surrogate code unit
     * @return the supplementary code point composed from the
     *         specified surrogate pair.
     */
    static int toCodePoint(char high, char low) {
        // Optimized form of:
        // return ((high - MIN_HIGH_SURROGATE) << 10)
        //         + (low - MIN_LOW_SURROGATE)
        //         + MIN_SUPPLEMENTARY_CODE_POINT;
        return ((high << 10) + low) + (MIN_SUPPLEMENTARY_CODE_POINT
                                       - (MIN_HIGH_SURROGATE << 10)
                                       - MIN_LOW_SURROGATE);
    }

    /**
     * Determines if the specified character is an ISO control
     * character. A character is considered to be an ISO control
     * character if its code is in the range {@code '\u0000'}
     * through {@code '\u001F'} or in the range
     * {@code '\u007F'} through {@code '\u009F'}.
     *
     * <p>This overload simply forwards to {@link #isISOControl(int)}.
     *
     * @param   ch      the character to be tested.
     * @return  {@code true} if the character is an ISO control character;
     *          {@code false} otherwise.
     */
    public static bool isISOControl(char ch) {
        return isISOControl(cast(int)ch);
    }

    /**
     * Determines if the referenced character (Unicode code point) is an ISO control
     * character. A character is considered to be an ISO control
     * character if its code is in the range {@code '\u0000'}
     * through {@code '\u001F'} or in the range
     * {@code '\u007F'} through {@code '\u009F'}.
     *
     * @param   codePoint the character (Unicode code point) to be tested.
     * @return  {@code true} if the character is an ISO control character;
     *          {@code false} otherwise.
     */
    public static bool isISOControl(int codePoint) {
        // Optimized form of:
        //     (codePoint >= 0x00 && codePoint <= 0x1F) ||
        //     (codePoint >= 0x7F && codePoint <= 0x9F);
        return codePoint <= 0x9F &&
            (codePoint >= 0x7F || (codePoint >>> 5 == 0));
    }

    /**
     * Converts the specified character (Unicode code point) to its
     * UTF-16 representation stored in a {@code char} array. For a BMP
     * code point the array has one element; for a supplementary code
     * point it has two, filled in by {@code toSurrogates}.
     *
     * NOTE(review): the elements are produced with {@code cast(char)},
     * and D's {@code char} is 8 bits wide, so any code unit above 0xFF
     * is truncated.
     *
     * @param  codePoint a Unicode code point
     * @return a {@code char} array having
     *         {@code codePoint}'s UTF-16 representation.
     * @throws IllegalArgumentException if the specified
     * {@code codePoint} is not a valid Unicode code point.
     */
    public static char[] toChars(int codePoint) {
        if (isBmpCodePoint(codePoint)) {
            return [ cast(char) codePoint ];
        } else if (isValidCodePoint(codePoint)) {
            char[] result = new char[2];
            toSurrogates(codePoint, result, 0);
            return result;
        } else {
            import std.string;
            throw new IllegalArgumentException(
                format("Not a valid Unicode code point: 0x%X", codePoint));
        }
    }

    /**
     * Determines whether the specified character (Unicode code point)
     * is in the <a href="#BMP">Basic Multilingual Plane (BMP)</a>,
     * i.e. whether it lies between U+0000 and U+FFFF inclusive.
     *
     * @param  codePoint the character (Unicode code point) to be tested
     * @return {@code true} if the specified code point is in the BMP;
     *         {@code false} otherwise.
     */
    public static bool isBmpCodePoint(int codePoint) {
        // Optimized form of:
        //     codePoint >= 0x0000 && codePoint <= 0xFFFF
        // The unsigned shift (>>>) also rejects negative values, because
        // their upper 16 bits are non-zero.
        return codePoint >>> 16 == 0;
    }

    /**
     * Determines whether the specified code point is a valid
     * <a href="http://www.unicode.org/glossary/#code_point">
     * Unicode code point value</a>.
     *
     * @param  codePoint the Unicode code point to be tested
     * @return {@code true} if the specified code point value is between
     *         {@link #MIN_CODE_POINT} and
     *         {@link #MAX_CODE_POINT} inclusive;
     *         {@code false} otherwise.
     */
    public static bool isValidCodePoint(int codePoint) {
        // Optimized form of:
        //     codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT
        int plane = codePoint >>> 16;
        return plane < ((MAX_CODE_POINT + 1) >>> 16);
    }

    /**
     * Stores the surrogate pair for {@code codePoint} into
     * {@code dst[index]} (high surrogate) and {@code dst[index+1]}
     * (low surrogate).
     *
     * @param codePoint a supplementary character (Unicode code point)
     * @param dst       the destination array
     * @param index     the position of the first element to write
     */
    static void toSurrogates(int codePoint, char[] dst, int index) {
        // We write elements "backwards" to guarantee all-or-nothing
        dst[index+1] = lowSurrogate(codePoint);
        dst[index] = highSurrogate(codePoint);
    }

    /**
     * Returns the trailing surrogate (a
     * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
     * low surrogate code unit</a>) of the
     * <a href="http://www.unicode.org/glossary/#surrogate_pair">
     * surrogate pair</a>
     * representing the specified supplementary character (Unicode
     * code point) in the UTF-16 encoding. If the specified character
     * is not a supplementary character, an unspecified {@code char}
     * is returned.
     *
     * NOTE(review): the 16-bit result is narrowed with {@code cast(char)},
     * so only its low 8 bits survive.
     *
     * @param   codePoint a supplementary character (Unicode code point)
     * @return  the trailing surrogate code unit used to represent the
     *          character in the UTF-16 encoding
     */
    public static char lowSurrogate(int codePoint) {
        return cast(char) ((codePoint & 0x3ff) + MIN_LOW_SURROGATE);
    }


    /**
     * Returns the leading surrogate (a
     * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
     * high surrogate code unit</a>) of the
     * <a href="http://www.unicode.org/glossary/#surrogate_pair">
     * surrogate pair</a>
     * representing the specified supplementary character (Unicode
     * code point) in the UTF-16 encoding. If the specified character
     * is not a supplementary character, an unspecified {@code char}
     * is returned.
     *
     * NOTE(review): the 16-bit result is narrowed with {@code cast(char)},
     * so only its low 8 bits survive.
     *
     * @param   codePoint a supplementary character (Unicode code point)
     * @return  the leading surrogate code unit used to represent the
     *          character in the UTF-16 encoding
     */
    public static char highSurrogate(int codePoint) {
        return cast(char) ((codePoint >>> 10)
            + (MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
    }

    /**
     * Returns the code point at the given index of {@code seq}. If the
     * code unit at the given index is in the high-surrogate range, the
     * following index is less than the length of {@code seq}, and the
     * code unit at the following index is in the low-surrogate range,
     * then the supplementary code point corresponding to this surrogate
     * pair is returned. Otherwise, the code unit at the given index is
     * returned.
     *
     * NOTE(review): {@code seq} is a D {@code string} of 8-bit code
     * units, for which {@code isHighSurrogate} is never true, so in
     * practice the code unit at {@code index} is returned unchanged.
     * The behaviour for an out-of-range {@code index} is whatever
     * {@code charAt} does; confirm against its implementation.
     *
     * @param seq a sequence of {@code char} values (code units)
     * @param index the index to the {@code char} values (code units)
     * in {@code seq} to be converted
     * @return the Unicode code point at the given index
     */
    public static int codePointAt(string seq, int index) {
        char c1 = seq.charAt(index);
        if (isHighSurrogate(c1) && ++index < seq.length) {
            char c2 = seq.charAt(index);
            if (isLowSurrogate(c2)) {
                return toCodePoint(c1, c2);
            }
        }
        return c1;
    }

    /**
     * Determines if the given {@code char} value is a
     * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
     * Unicode high-surrogate code unit</a>
     * (also known as <i>leading-surrogate code unit</i>).
     *
     * <p>Such values do not represent characters by themselves,
     * but are used in the representation of
     * <a href="#supplementary">supplementary characters</a>
     * in the UTF-16 encoding.
     *
     * NOTE(review): an 8-bit {@code char} is at most 0xFF, which is below
     * {@code MIN_HIGH_SURROGATE}, so this currently always returns
     * {@code false}.
     *
     * @param  ch the {@code char} value to be tested.
     * @return {@code true} if the {@code char} value is between
     *         {@link #MIN_HIGH_SURROGATE} and
     *         {@link #MAX_HIGH_SURROGATE} inclusive;
     *         {@code false} otherwise.
     */
    public static bool isHighSurrogate(char ch) {
        // MAX_HIGH_SURROGATE + 1 == MIN_LOW_SURROGATE
        return ch >= MIN_HIGH_SURROGATE && ch < (MAX_HIGH_SURROGATE + 1);
    }

    /**
     * Determines if the given {@code char} value is a
     * <a href="http://www.unicode.org/glossary/#low_surrogate_code_unit">
     * Unicode low-surrogate code unit</a>
     * (also known as <i>trailing-surrogate code unit</i>).
     *
     * <p>Such values do not represent characters by themselves,
     * but are used in the representation of
     * <a href="#supplementary">supplementary characters</a>
     * in the UTF-16 encoding.
     *
     * NOTE(review): an 8-bit {@code char} is at most 0xFF, which is below
     * {@code MIN_LOW_SURROGATE}, so this currently always returns
     * {@code false}.
     *
     * @param  ch the {@code char} value to be tested.
     * @return {@code true} if the {@code char} value is between
     *         {@link #MIN_LOW_SURROGATE} and
     *         {@link #MAX_LOW_SURROGATE} inclusive;
     *         {@code false} otherwise.
     */
    public static bool isLowSurrogate(char ch) {
        return ch >= MIN_LOW_SURROGATE && ch < (MAX_LOW_SURROGATE + 1);
    }

    /**
     * Determines if the specified character is a letter.
     * <p>
     * A character is considered to be a letter if its general
     * category type is any of the following:
     * <ul>
     * <li> {@code UPPERCASE_LETTER}
     * <li> {@code LOWERCASE_LETTER}
     * <li> {@code TITLECASE_LETTER}
     * <li> {@code MODIFIER_LETTER}
     * <li> {@code OTHER_LETTER}
     * </ul>
     *
     * Not all letters have case. Many characters are
     * letters but are neither uppercase nor lowercase nor titlecase.
     *
     * <p>The value of {@code ch} is interpreted as the code point
     * U+0000..U+00FF with the same numeric value. This method cannot
     * handle characters outside that range.
     *
     * @param   ch   the character to be tested.
     * @return  {@code true} if the character is a letter;
     *          {@code false} otherwise.
     */
    public static bool isLetter(char ch) {
        // This used to forward to isLetter(cast(int)ch), but the int
        // overload is commented out below. That left this method as the
        // only candidate, so the call could at best resolve back to itself.
        // Classify the value directly instead. Within U+0000..U+00FF the
        // Unicode Alphabetic property tested by std.uni.isAlpha covers
        // exactly the letter categories listed above.
        import std.uni : isAlpha;
        return isAlpha(cast(dchar) ch);
    }

    /**
     * Determines if the specified character (Unicode code point) is a letter.
     * <p>
     * A character is considered to be a letter if its general
     * category type is any of the following:
     * <ul>
     * <li> {@code UPPERCASE_LETTER}
     * <li> {@code LOWERCASE_LETTER}
     * <li> {@code TITLECASE_LETTER}
     * <li> {@code MODIFIER_LETTER}
     * <li> {@code OTHER_LETTER}
     * </ul>
     *
     * Not all letters have case. Many characters are
     * letters but are neither uppercase nor lowercase nor titlecase.
     *
     * <p>Disabled: it depends on {@code getType(int)}, which is not
     * defined in this class.
     *
     * @param   codePoint the character (Unicode code point) to be tested.
     * @return  {@code true} if the character is a letter;
     *          {@code false} otherwise.
     */
    // public static bool isLetter(int codePoint) {
    //     return ((((1 << Char.UPPERCASE_LETTER) |
    //         (1 << Char.LOWERCASE_LETTER) |
    //         (1 << Char.TITLECASE_LETTER) |
    //         (1 << Char.MODIFIER_LETTER) |
    //         (1 << Char.OTHER_LETTER)) >> getType(codePoint)) & 1)
    //         != 0;
    // }
}

/**
 * Holds the pre-built {@code Char} instances for the values 0 through 127
 * that {@code Char.valueOf} hands out.
 */
private class CharacterCache {
    // Not meant to be instantiated; only the static cache is used.
    private this() {
    }

    /** Cached instances, indexed by character value (0..127). */
    __gshared Char[] cache;

    // Fills the cache with one Char per value in 0..127.
    shared static this() {
        cache = new Char[127 + 1];
        for (int i = 0; i < cast(int)cache.length; i++) {
            cache[i] = new Char(cast(char) i);
        }
    }
}