UnicodeCJK.js 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. /**
  2. * Copyright (c) 2013-present, Facebook, Inc.
  3. *
  4. * This source code is licensed under the MIT license found in the
  5. * LICENSE file in the root directory of this source tree.
  6. *
  7. * @typechecks
  8. */
  9. /**
  10. * Unicode algorithms for CJK (Chinese, Japanese, Korean) writing systems.
  11. *
  12. * Utilities for Hanzi/Kanji/Hanja logographs and Kanas (Katakana and Hiragana)
  13. * syllables.
  14. *
  15. * For Korean Hangul see module `UnicodeHangulKorean`.
  16. */
  'use strict';

  /**
   * Latin
   *
   * Character-class fragments (meant to be placed between `[` and `]` of a
   * RegExp source) for ASCII letters and their fullwidth forms.
   *
   * NOTE: The code assumes these sets include only BMP characters.
   */
  var R_LATIN_ASCII = 'a-zA-Z';
  var R_LATIN_FULLWIDTH = "\uFF21-\uFF3A\uFF41-\uFF5A";
  var R_LATIN = [R_LATIN_ASCII, R_LATIN_FULLWIDTH].join('');
  /**
   * Hiragana & Katakana
   *
   * R_* values are character-class fragments for building RegExp sources;
   * I_* values are [first, last] code-point bounds of a range.
   *
   * NOTE: Some ranges include non-BMP characters. We do not support those ranges
   * for now.
   */
  var R_HIRAGANA = "\u3040-\u309F";
  var R_KATAKANA = "\u30A0-\u30FF";
  var R_KATAKANA_PHONETIC = "\u31F0-\u31FF";
  var R_KATAKANA_HALFWIDTH = "\uFF65-\uFF9F";
  // var R_KANA_SUPPLEMENT = '\U0001B000-\U0001B0FF';
  var R_KATAKANA_ALL = [
    R_KATAKANA,
    R_KATAKANA_PHONETIC,
    R_KATAKANA_HALFWIDTH
  ].join('');
  var R_KANA = [R_HIRAGANA, R_KATAKANA_ALL].join('');

  var I_HIRAGANA = [0x3040, 0x309F];
  var I_KATAKANA = [0x30A0, 0x30FF];
  // Distance between the start of the Hiragana range and the start of the
  // Katakana range.
  var I_HIRAGANA_TO_KATAKANA = I_KATAKANA[0] - I_HIRAGANA[0];
  /**
   * Hanzi/Kanji/Hanja
   *
   * Character-class fragments covering the BMP ideograph blocks: the whole
   * CJK Unified Ideographs block (U+4E00-U+9FFF) and Extension A
   * (U+3400-U+4DBF).
   *
   * NOTE: Some ranges include non-BMP characters. We do not support those ranges
   * for now.
   */
  // The upper bound is the end of the block (U+9FFF) so that ideographs
  // assigned after U+9FCF are matched as well.
  var R_IDEO_MAIN = "\u4E00-\u9FFF";
  var R_IDEO_EXT_A = "\u3400-\u4DBF";
  // var R_IDEO_EXT_B = '\U00020000-\U0002A6DF';
  // var R_IDEO_EXT_C = '\U0002A700-\U0002B73F';
  // var R_IDEO_EXT_D = '\U0002B740-\U0002B81F';
  var R_IDEO = R_IDEO_MAIN + R_IDEO_EXT_A;
  /**
   * Hangul
   *
   * Only the syllables range is active; the other ranges are kept here
   * commented out.
   */
  // var R_HANGUL_JAMO = '\u1100-\u11FF';
  // var R_HANGUL_JAMO_EXT_A = '\uA960-\uA97F';
  // var R_HANGUL_JAMO_EXT_B = '\uD7B0-\uD7FF';
  // var R_HANGUL_COMPATIBILITY = '\u3130-\u318F';
  // var R_HANGUL_COMP_HALFWIDTH = '\uFFA0-\uFFDF';
  var R_HANGUL_SYLLABLES = "\uAC00-\uD7AF";

  /**
   * Globals
   *
   * R_IDEO_OR_SYLL is the combined character-class fragment for ideographs,
   * Kana and Hangul syllables. The REGEX_* variables cache compiled RegExp
   * objects; they start as null and are filled in by the functions of this
   * module the first time each one is needed.
   */
  var R_IDEO_OR_SYLL = [R_IDEO, R_KANA, R_HANGUL_SYLLABLES].join('');

  var REGEX_KANA = null;
  var REGEX_IDEO = null;
  var REGEX_IDEO_OR_SYLL = null;
  var REGEX_IS_KANA_WITH_TRAILING_LATIN = null;
  /**
   * Tests whether a string contains at least one Kana (Hiragana or Katakana)
   * character.
   *
   * The RegExp is compiled on the first call and reused afterwards.
   *
   * @param {string} str
   * @return {boolean}
   */
  function hasKana(str) {
    if (!REGEX_KANA) {
      REGEX_KANA = new RegExp('[' + R_KANA + ']');
    }
    return REGEX_KANA.test(str);
  }
  /**
   * Tests whether a string contains at least one CJK Ideograph character.
   *
   * The RegExp is compiled on the first call and reused afterwards.
   *
   * @param {string} str
   * @return {boolean}
   */
  function hasIdeograph(str) {
    if (!REGEX_IDEO) {
      REGEX_IDEO = new RegExp('[' + R_IDEO + ']');
    }
    return REGEX_IDEO.test(str);
  }
  /**
   * Tests whether a string contains at least one CJK Ideograph or Syllable
   * character.
   *
   * The RegExp is compiled on the first call and reused afterwards.
   *
   * @param {string} str
   * @return {boolean}
   */
  function hasIdeoOrSyll(str) {
    if (!REGEX_IDEO_OR_SYLL) {
      REGEX_IDEO_OR_SYLL = new RegExp('[' + R_IDEO_OR_SYLL + ']');
    }
    return REGEX_IDEO_OR_SYLL.test(str);
  }
  /**
   * Converts one Hiragana character to the matching Katakana character.
   *
   * Only code points that have a Katakana counterpart at the fixed
   * Hiragana-to-Katakana offset are shifted: the letters U+3041-U+3096 and the
   * iteration marks U+309D-U+309E. Everything else is returned unchanged. That
   * includes the remainder of the Hiragana block (unassigned U+3040 and
   * U+3097-U+3098, the voicing marks U+3099-U+309C and the digraph U+309F),
   * which a blanket shift would turn into unrelated Katakana characters.
   *
   * @param {string} chr String whose first UTF-16 code unit is converted.
   * @return {string} A one-character string.
   */
  function charCodeToKatakana(chr) {
    var charCode = chr.charCodeAt(0);
    var isHiraganaLetter = charCode >= 0x3041 && charCode <= 0x3096;
    var isIterationMark = charCode === 0x309D || charCode === 0x309E;
    if (isHiraganaLetter || isIterationMark) {
      charCode += I_HIRAGANA_TO_KATAKANA;
    }
    return String.fromCharCode(charCode);
  }
  /**
   * Returns a copy of the string in which every Hiragana character has been
   * replaced by the matching Katakana. Strings without any Kana are returned
   * as they are.
   *
   * @param {string} str
   * @return {string}
   */
  function hiraganaToKatakana(str) {
    if (hasKana(str)) {
      var units = str.split('');
      for (var i = 0; i < units.length; i++) {
        units[i] = charCodeToKatakana(units[i]);
      }
      return units.join('');
    }
    return str;
  }
  /**
   * Tests whether the whole string consists of one or more Kana characters
   * followed by exactly one Latin character (ASCII or fullwidth).
   *
   * The RegExp is compiled on the first call and reused afterwards.
   *
   * @param {string} str
   * @return {boolean}
   */
  function isKanaWithTrailingLatin(str) {
    if (!REGEX_IS_KANA_WITH_TRAILING_LATIN) {
      var source = '^[' + R_KANA + ']+[' + R_LATIN + ']$';
      REGEX_IS_KANA_WITH_TRAILING_LATIN = new RegExp(source);
    }
    return REGEX_IS_KANA_WITH_TRAILING_LATIN.test(str);
  }
  /**
   * Drops the trailing Latin character from a string that is exactly a sequence
   * of Kana characters followed by one Latin character. Any other string is
   * returned unchanged.
   *
   * @param {string} str
   * @return {string}
   */
  function kanaRemoveTrailingLatin(str) {
    if (!isKanaWithTrailingLatin(str)) {
      return str;
    }
    // `slice` replaces the deprecated `substr`; a matching string always has
    // at least two characters, so this removes exactly the Latin one.
    return str.slice(0, -1);
  }
  /**
   * Public interface of the module.
   */
  var UnicodeCJK = {};
  UnicodeCJK.hasKana = hasKana;
  UnicodeCJK.hasIdeograph = hasIdeograph;
  UnicodeCJK.hasIdeoOrSyll = hasIdeoOrSyll;
  UnicodeCJK.hiraganaToKatakana = hiraganaToKatakana;
  UnicodeCJK.isKanaWithTrailingLatin = isKanaWithTrailingLatin;
  UnicodeCJK.kanaRemoveTrailingLatin = kanaRemoveTrailingLatin;

  module.exports = UnicodeCJK;