index.js 63 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145
  1. 'use strict';
  2. var Preprocessor = require('./preprocessor'),
  3. UNICODE = require('../common/unicode'),
  4. neTree = require('./named_entity_data');
  5. //Aliases
  6. var $ = UNICODE.CODE_POINTS,
  7. $$ = UNICODE.CODE_POINT_SEQUENCES;
  //Replacement code points for numeric entities, keyed by the code point the entity refers to.
  //A numeric reference whose value is one of these keys resolves to the mapped code point instead.
  var NUMERIC_ENTITY_REPLACEMENTS = {
      0x00: 0xFFFD, 0x0D: 0x000D,
      0x80: 0x20AC, 0x81: 0x0081, 0x82: 0x201A, 0x83: 0x0192,
      0x84: 0x201E, 0x85: 0x2026, 0x86: 0x2020, 0x87: 0x2021,
      0x88: 0x02C6, 0x89: 0x2030, 0x8A: 0x0160, 0x8B: 0x2039,
      0x8C: 0x0152, 0x8D: 0x008D, 0x8E: 0x017D, 0x8F: 0x008F,
      0x90: 0x0090, 0x91: 0x2018, 0x92: 0x2019, 0x93: 0x201C,
      0x94: 0x201D, 0x95: 0x2022, 0x96: 0x2013, 0x97: 0x2014,
      0x98: 0x02DC, 0x99: 0x2122, 0x9A: 0x0161, 0x9B: 0x203A,
      0x9C: 0x0153, 0x9D: 0x009D, 0x9E: 0x017E, 0x9F: 0x0178
  };
  //Named entity tree flags. Each flag is a single bit; MAX_BRANCH_MARKER_VALUE is all three bits set.
  var HAS_DATA_FLAG = 1 << 0,
      DATA_DUPLET_FLAG = 1 << 1,
      HAS_BRANCHES_FLAG = 1 << 2,
      MAX_BRANCH_MARKER_VALUE = HAS_DATA_FLAG | DATA_DUPLET_FLAG | HAS_BRANCHES_FLAG;
  //States
  //Every state constant holds its own name as a string. The tokenizer stores one of these in
  //`this.state` and uses it as the key of the handler to invoke for the next code point.
  var DATA_STATE = 'DATA_STATE';
  var CHARACTER_REFERENCE_IN_DATA_STATE = 'CHARACTER_REFERENCE_IN_DATA_STATE';
  var RCDATA_STATE = 'RCDATA_STATE';
  var CHARACTER_REFERENCE_IN_RCDATA_STATE = 'CHARACTER_REFERENCE_IN_RCDATA_STATE';
  var RAWTEXT_STATE = 'RAWTEXT_STATE';
  var SCRIPT_DATA_STATE = 'SCRIPT_DATA_STATE';
  var PLAINTEXT_STATE = 'PLAINTEXT_STATE';
  var TAG_OPEN_STATE = 'TAG_OPEN_STATE';
  var END_TAG_OPEN_STATE = 'END_TAG_OPEN_STATE';
  var TAG_NAME_STATE = 'TAG_NAME_STATE';
  var RCDATA_LESS_THAN_SIGN_STATE = 'RCDATA_LESS_THAN_SIGN_STATE';
  var RCDATA_END_TAG_OPEN_STATE = 'RCDATA_END_TAG_OPEN_STATE';
  var RCDATA_END_TAG_NAME_STATE = 'RCDATA_END_TAG_NAME_STATE';
  var RAWTEXT_LESS_THAN_SIGN_STATE = 'RAWTEXT_LESS_THAN_SIGN_STATE';
  var RAWTEXT_END_TAG_OPEN_STATE = 'RAWTEXT_END_TAG_OPEN_STATE';
  var RAWTEXT_END_TAG_NAME_STATE = 'RAWTEXT_END_TAG_NAME_STATE';
  var SCRIPT_DATA_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_LESS_THAN_SIGN_STATE';
  var SCRIPT_DATA_END_TAG_OPEN_STATE = 'SCRIPT_DATA_END_TAG_OPEN_STATE';
  var SCRIPT_DATA_END_TAG_NAME_STATE = 'SCRIPT_DATA_END_TAG_NAME_STATE';
  var SCRIPT_DATA_ESCAPE_START_STATE = 'SCRIPT_DATA_ESCAPE_START_STATE';
  var SCRIPT_DATA_ESCAPE_START_DASH_STATE = 'SCRIPT_DATA_ESCAPE_START_DASH_STATE';
  var SCRIPT_DATA_ESCAPED_STATE = 'SCRIPT_DATA_ESCAPED_STATE';
  var SCRIPT_DATA_ESCAPED_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_STATE';
  var SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_DASH_STATE';
  var SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE';
  var SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE';
  var SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE';
  var BEFORE_ATTRIBUTE_NAME_STATE = 'BEFORE_ATTRIBUTE_NAME_STATE';
  var ATTRIBUTE_NAME_STATE = 'ATTRIBUTE_NAME_STATE';
  var AFTER_ATTRIBUTE_NAME_STATE = 'AFTER_ATTRIBUTE_NAME_STATE';
  var BEFORE_ATTRIBUTE_VALUE_STATE = 'BEFORE_ATTRIBUTE_VALUE_STATE';
  var ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE';
  var ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE';
  var ATTRIBUTE_VALUE_UNQUOTED_STATE = 'ATTRIBUTE_VALUE_UNQUOTED_STATE';
  var CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE = 'CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE';
  var AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 'AFTER_ATTRIBUTE_VALUE_QUOTED_STATE';
  var SELF_CLOSING_START_TAG_STATE = 'SELF_CLOSING_START_TAG_STATE';
  var BOGUS_COMMENT_STATE = 'BOGUS_COMMENT_STATE';
  var BOGUS_COMMENT_STATE_CONTINUATION = 'BOGUS_COMMENT_STATE_CONTINUATION';
  var MARKUP_DECLARATION_OPEN_STATE = 'MARKUP_DECLARATION_OPEN_STATE';
  var COMMENT_START_STATE = 'COMMENT_START_STATE';
  var COMMENT_START_DASH_STATE = 'COMMENT_START_DASH_STATE';
  var COMMENT_STATE = 'COMMENT_STATE';
  var COMMENT_END_DASH_STATE = 'COMMENT_END_DASH_STATE';
  var COMMENT_END_STATE = 'COMMENT_END_STATE';
  var COMMENT_END_BANG_STATE = 'COMMENT_END_BANG_STATE';
  var DOCTYPE_STATE = 'DOCTYPE_STATE';
  var DOCTYPE_NAME_STATE = 'DOCTYPE_NAME_STATE';
  var AFTER_DOCTYPE_NAME_STATE = 'AFTER_DOCTYPE_NAME_STATE';
  var BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE';
  var DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE';
  var DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE';
  var BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 'BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE';
  var BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE';
  var DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE';
  var DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE';
  var AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE';
  var BOGUS_DOCTYPE_STATE = 'BOGUS_DOCTYPE_STATE';
  var CDATA_SECTION_STATE = 'CDATA_SECTION_STATE';
  87. //Utils
  //OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline
  //these functions if they are located in another module, due to the context switch.
  //Always perform an inlining check before modifying these functions ('node --trace-inlining').
  //Returns true when `cp` is one of the four code points compared here:
  //SPACE, LINE_FEED, TABULATION or FORM_FEED.
  function isWhitespace(cp) {
      if (cp === $.SPACE || cp === $.LINE_FEED)
          return true;

      return cp === $.TABULATION || cp === $.FORM_FEED;
  }
  //Returns true when `cp` lies in the inclusive range DIGIT_0..DIGIT_9.
  function isAsciiDigit(cp) {
      var notBelowZero = cp >= $.DIGIT_0;

      return notBelowZero && cp <= $.DIGIT_9;
  }
  //Returns true when `cp` lies in the inclusive range LATIN_CAPITAL_A..LATIN_CAPITAL_Z.
  function isAsciiUpper(cp) {
      var notBelowA = cp >= $.LATIN_CAPITAL_A;

      return notBelowA && cp <= $.LATIN_CAPITAL_Z;
  }
  //Returns true when `cp` lies in the inclusive range LATIN_SMALL_A..LATIN_SMALL_Z.
  function isAsciiLower(cp) {
      var notBelowA = cp >= $.LATIN_SMALL_A;

      return notBelowA && cp <= $.LATIN_SMALL_Z;
  }
  //Returns true when `cp` passes isAsciiLower() or isAsciiUpper(); the lower-case check runs first.
  function isAsciiLetter(cp) {
      if (isAsciiLower(cp))
          return true;

      return isAsciiUpper(cp);
  }
  //Returns true when `cp` passes isAsciiLetter() or isAsciiDigit(); the letter check runs first.
  function isAsciiAlphaNumeric(cp) {
      if (isAsciiLetter(cp))
          return true;

      return isAsciiDigit(cp);
  }
  //Digit check used for numeric entities.
  //A decimal digit always qualifies. Otherwise the result is `isHex && <cp is A-F or a-f>`, so a falsy
  //`isHex` is returned unchanged when `cp` is not a decimal digit.
  function isDigit(cp, isHex) {
      if (isAsciiDigit(cp))
          return true;

      return isHex && (
          cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_F ||
          cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_F
      );
  }
  //Returns true for code points in the range 0xD800..0xDFFF and for anything above 0x10FFFF.
  function isReservedCodePoint(cp) {
      var inSurrogateRange = cp >= 0xD800 && cp <= 0xDFFF,
          aboveMaximum = cp > 0x10FFFF;

      return inSurrogateRange || aboveMaximum;
  }
  //Shifts `cp` by 0x20, which maps an ASCII upper-case letter onto its lower-case counterpart.
  //No range check is made here: the offset is added to whatever value is passed in.
  function toAsciiLowerCodePoint(cp) {
      var asciiCaseOffset = 0x0020;

      return cp + asciiCaseOffset;
  }
  //NOTE: String.fromCharCode() can handle only code points from the BMP subset,
  //so code points above 0xFFFF are split into a surrogate pair manually.
  //(see: https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/String/fromCharCode#Getting_it_to_work_with_higher_values)
  function toChar(cp) {
      var fitsInSingleUnit = cp <= 0xFFFF;

      if (fitsInSingleUnit)
          return String.fromCharCode(cp);

      var offset = cp - 0x10000,
          highSurrogate = (offset >>> 10 & 0x3FF) | 0xD800,
          lowSurrogate = 0xDC00 | (offset & 0x3FF);

      return String.fromCharCode(highSurrogate) + String.fromCharCode(lowSurrogate);
  }
  //Returns the one-character string for the code point produced by toAsciiLowerCodePoint(cp).
  function toAsciiLowerChar(cp) {
      var loweredCp = toAsciiLowerCodePoint(cp);

      return String.fromCharCode(loweredCp);
  }
  //Binary-searches the branch keys of a named entity tree node for `cp`.
  //`neTree[nodeIx + 1]` is read as the branch count; the keys occupy the slots right after it, and
  //the value returned for a matching key sits `branchCount` slots further on.
  //Returns -1 when no key equals `cp`.
  function findNamedEntityTreeBranch(nodeIx, cp) {
      var branchCount = neTree[++nodeIx],
          first = ++nodeIx,
          last = first + branchCount - 1;

      while (first <= last) {
          var middle = first + last >>> 1,
              key = neTree[middle];

          if (key < cp) {
              first = middle + 1;
              continue;
          }

          if (key > cp) {
              last = middle - 1;
              continue;
          }

          return neTree[middle + branchCount];
      }

      return -1;
  }
  147. //Tokenizer
  //Tokenizer constructor; also the module's export.
  //Creates a fresh Preprocessor and resets every piece of tokenizer state to its initial value.
  var Tokenizer = module.exports = function () {
      //Input source and the queue of tokens waiting to be returned by getNextToken()
      this.preprocessor = new Preprocessor();
      this.tokenQueue = [];

      this.allowCDATA = false;

      //State machine position
      this.state = DATA_STATE;
      this.returnState = '';

      this.tempBuff = [];
      this.additionalAllowedCp = undefined;
      this.lastStartTagName = '';

      //Hibernation bookkeeping: the tokenizer stays inactive until input is written
      this.consumedAfterSnapshot = -1;
      this.active = false;

      //Tokens and attribute currently under construction
      this.currentCharacterToken = null;
      this.currentToken = null;
      this.currentAttr = null;
  };
  //Token types
  //Each token type is exposed as a static property of Tokenizer whose value is its own name.
  [
      'CHARACTER_TOKEN',
      'NULL_CHARACTER_TOKEN',
      'WHITESPACE_CHARACTER_TOKEN',
      'START_TAG_TOKEN',
      'END_TAG_TOKEN',
      'COMMENT_TOKEN',
      'DOCTYPE_TOKEN',
      'EOF_TOKEN',
      'HIBERNATION_TOKEN'
  ].forEach(function (tokenType) {
      Tokenizer[tokenType] = tokenType;
  });
  //Tokenizer initial states for different modes.
  //Each mode name maps to the state constant the tokenizer should start in for that mode.
  Tokenizer.MODE = {};
  Tokenizer.MODE.DATA = DATA_STATE;
  Tokenizer.MODE.RCDATA = RCDATA_STATE;
  Tokenizer.MODE.RAWTEXT = RAWTEXT_STATE;
  Tokenizer.MODE.SCRIPT_DATA = SCRIPT_DATA_STATE;
  Tokenizer.MODE.PLAINTEXT = PLAINTEXT_STATE;
  181. //Static
  //Looks up an attribute value by name on a tag token.
  //The attribute list is scanned from the last entry to the first and the value of the first
  //entry whose name equals `attrName` is returned; null is returned when nothing matches.
  Tokenizer.getTokenAttr = function (token, attrName) {
      var attrs = token.attrs,
          idx = attrs.length;

      while (idx-- > 0) {
          var attr = attrs[idx];

          if (attr.name === attrName)
              return attr.value;
      }

      return null;
  };
  189. //API
  //Runs the state machine until at least one token is queued or the tokenizer becomes inactive,
  //then removes and returns the first queued token (undefined when the queue is empty).
  Tokenizer.prototype.getNextToken = function () {
      while (this.tokenQueue.length === 0 && this.active) {
          this._hibernationSnapshot();

          var cp = this._consume();

          //NOTE: when hibernation kicks in, the handler for the current state is not invoked.
          if (this._ensureHibernation())
              continue;

          this[this.state](cp);
      }

      return this.tokenQueue.shift();
  };
  //Marks the tokenizer as active and forwards the chunk, together with the last-chunk flag,
  //to the preprocessor.
  Tokenizer.prototype.write = function (chunk, isLastChunk) {
      this.active = true;

      var preprocessor = this.preprocessor;

      preprocessor.write(chunk, isLastChunk);
  };
  //Marks the tokenizer as active and asks the preprocessor to insert the chunk at its current position.
  Tokenizer.prototype.insertHtmlAtCurrentPos = function (chunk) {
      this.active = true;

      var preprocessor = this.preprocessor;

      preprocessor.insertHtmlAtCurrentPos(chunk);
  };
  207. //Hibernation
  //Starts a new snapshot by resetting the count of code points consumed since the last one.
  //_ensureHibernation() uses this count to know how many code points to give back.
  Tokenizer.prototype._hibernationSnapshot = function () {
      var nothingConsumedYet = 0;

      this.consumedAfterSnapshot = nothingConsumedYet;
  };
  //Puts the tokenizer to sleep when the preprocessor reports that the end of the chunk was hit.
  //In that case every code point consumed since the last snapshot is given back to the preprocessor,
  //the tokenizer is deactivated, a HIBERNATION_TOKEN is queued and true is returned.
  //Otherwise nothing changes and false is returned.
  Tokenizer.prototype._ensureHibernation = function () {
      if (!this.preprocessor.endOfChunkHit)
          return false;

      while (this.consumedAfterSnapshot > 0) {
          this.preprocessor.retreat();
          this.consumedAfterSnapshot--;
      }

      this.active = false;
      this.tokenQueue.push({type: Tokenizer.HIBERNATION_TOKEN});

      return true;
  };
  221. //Consumption
  //Advances the preprocessor by one step and returns what it yields.
  //The per-snapshot consumption counter is bumped before the preprocessor is advanced.
  Tokenizer.prototype._consume = function () {
      this.consumedAfterSnapshot++;

      var cp = this.preprocessor.advance();

      return cp;
  };
  //Undoes a single _consume(): lowers the per-snapshot counter, then steps the preprocessor back.
  Tokenizer.prototype._unconsume = function () {
      this.consumedAfterSnapshot--;

      var preprocessor = this.preprocessor;

      preprocessor.retreat();
  };
  //Calls _unconsume() `count` times.
  //The loop is bounded by `remaining > 0`, so a zero, negative, NaN or undefined count unconsumes
  //nothing, and a fractional count still terminates. A `while (count--)` loop would never end for
  //a negative or fractional count, because the counter would step past zero without hitting it.
  Tokenizer.prototype._unconsumeSeveral = function (count) {
      for (var remaining = count; remaining > 0; remaining--)
          this._unconsume();
  };
  //Switches to `state` and gives the current code point back, so that the handler of the new state
  //receives the same code point on the next step.
  Tokenizer.prototype._reconsumeInState = function (state) {
      var nextState = state;

      this.state = nextState;
      this._unconsume();
  };
  //Checks whether the input, starting with the already consumed `startCp`, matches `pattern`
  //(an indexable sequence of code points).
  //The first pattern entry is compared against `startCp`; each following entry is compared against a
  //freshly consumed code point. Unless `caseSensitive` is truthy, an input code point also matches
  //when it equals toAsciiLowerCodePoint() of the pattern entry.
  //On success the consumed code points stay consumed and true is returned. On a mismatch or EOF,
  //everything consumed by this call is unconsumed and false is returned.
  Tokenizer.prototype._consumeSubsequentIfMatch = function (pattern, startCp, caseSensitive) {
      var patternLength = pattern.length,
          consumedCount = 0,
          cp = startCp;

      for (var patternPos = 0; patternPos < patternLength; patternPos++) {
          //NOTE: the first pattern entry is matched against startCp, so nothing is consumed for it.
          if (patternPos > 0) {
              cp = this._consume();
              consumedCount++;
          }

          var matches = false;

          if (cp !== $.EOF) {
              var patternCp = pattern[patternPos];

              matches = cp === patternCp || !caseSensitive && cp === toAsciiLowerCodePoint(patternCp);
          }

          if (!matches) {
              this._unconsumeSeveral(consumedCount);
              return false;
          }
      }

      return true;
  };
  264. //Lookahead
  //Peeks at the next code point: consumes it, immediately gives it back and returns it.
  Tokenizer.prototype._lookahead = function () {
      var nextCp = this._consume();

      this._unconsume();

      return nextCp;
  };
  270. //Temp buffer
  //Returns true when the temporary buffer has the same length as $$.SCRIPT_STRING and holds the same
  //code point at every position.
  Tokenizer.prototype.isTempBufferEqualToScriptString = function () {
      var buffered = this.tempBuff.length;

      if (buffered !== $$.SCRIPT_STRING.length)
          return false;

      var pos = 0;

      while (pos < this.tempBuff.length) {
          if (this.tempBuff[pos] !== $$.SCRIPT_STRING[pos])
              return false;

          pos++;
      }

      return true;
  };
  280. //Token creation
  //Makes an empty start tag token the current token: no tag name yet, not self-closing, no attributes.
  Tokenizer.prototype._createStartTagToken = function () {
      var startTagToken = {
          type: Tokenizer.START_TAG_TOKEN,
          tagName: '',
          selfClosing: false,
          attrs: []
      };

      this.currentToken = startTagToken;
  };
  //Makes an empty end tag token the current token: no tag name yet and an empty attribute list.
  Tokenizer.prototype._createEndTagToken = function () {
      var endTagToken = {
          type: Tokenizer.END_TAG_TOKEN,
          tagName: '',
          attrs: []
      };

      this.currentToken = endTagToken;
  };
  //Makes a comment token with empty data the current token.
  Tokenizer.prototype._createCommentToken = function () {
      var commentToken = {
          type: Tokenizer.COMMENT_TOKEN,
          data: ''
      };

      this.currentToken = commentToken;
  };
  //Makes a DOCTYPE token the current token.
  //The name starts out as `initialName`; force-quirks is off and both identifiers are null.
  Tokenizer.prototype._createDoctypeToken = function (initialName) {
      var doctypeToken = {
          type: Tokenizer.DOCTYPE_TOKEN,
          name: initialName,
          forceQuirks: false,
          publicId: null,
          systemId: null
      };

      this.currentToken = doctypeToken;
  };
  //Starts a new pending character token of the given type whose text is `ch`.
  Tokenizer.prototype._createCharacterToken = function (type, ch) {
      var characterToken = {
          type: type,
          chars: ch
      };

      this.currentCharacterToken = characterToken;
  };
  317. //Tag attributes
  //Starts a new current attribute whose name begins with `attrNameFirstCh` and whose value is empty.
  Tokenizer.prototype._createAttr = function (attrNameFirstCh) {
      var attr = {
          name: attrNameFirstCh,
          value: ''
      };

      this.currentAttr = attr;
  };
  //Returns true when the current token already has an attribute with the current attribute's name,
  //i.e. when Tokenizer.getTokenAttr() finds a value for that name.
  Tokenizer.prototype._isDuplicateAttr = function () {
      var existingValue = Tokenizer.getTokenAttr(this.currentToken, this.currentAttr.name);

      return existingValue !== null;
  };
  //Finishes the attribute name: switches to `toState` and adds the current attribute to the current
  //token, unless the token already has an attribute with that name.
  Tokenizer.prototype._leaveAttrName = function (toState) {
      this.state = toState;

      if (this._isDuplicateAttr())
          return;

      this.currentToken.attrs.push(this.currentAttr);
  };
  //Finishes the attribute value. In this implementation that only means switching to `toState`.
  Tokenizer.prototype._leaveAttrValue = function (toState) {
      var nextState = toState;

      this.state = nextState;
  };
  335. //Appropriate end tag token
  336. //(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#appropriate-end-tag-token)
  //Returns true when the current token's tag name is the tag name of the last emitted start tag.
  Tokenizer.prototype._isAppropriateEndTagToken = function () {
      var currentTagName = this.currentToken.tagName;

      return currentTagName === this.lastStartTagName;
  };
  340. //Token emission
  //Queues the current token. Any pending character token is flushed first, so it is queued ahead of
  //the current token. Afterwards there is no current token.
  Tokenizer.prototype._emitCurrentToken = function () {
      this._emitCurrentCharacterToken();

      var token = this.currentToken;

      //NOTE: remember the emitted start tag's name; it is needed later to decide whether
      //a following end tag token is an appropriate one.
      if (token.type === Tokenizer.START_TAG_TOKEN)
          this.lastStartTagName = token.tagName;

      this.tokenQueue.push(token);
      this.currentToken = null;
  };
  //Queues the pending character token, if there is one, and clears it. Does nothing otherwise.
  Tokenizer.prototype._emitCurrentCharacterToken = function () {
      if (!this.currentCharacterToken)
          return;

      this.tokenQueue.push(this.currentCharacterToken);
      this.currentCharacterToken = null;
  };
  //Flushes any pending character token and then queues an EOF token.
  Tokenizer.prototype._emitEOFToken = function () {
      this._emitCurrentCharacterToken();

      var eofToken = {type: Tokenizer.EOF_TOKEN};

      this.tokenQueue.push(eofToken);
  };
  359. //Characters emission
  //OPTIMIZATION: the specification uses only one type of character token (one token per character).
  //This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters.
  //If we have a sequence of characters that belong to the same group, the parser can process it
  //as a single solid character token.
  //So, there are 3 types of character tokens in parse5:
  //1)NULL_CHARACTER_TOKEN - \u0000-character sequences (e.g. '\u0000\u0000\u0000')
  //2)WHITESPACE_CHARACTER_TOKEN - any whitespace/new-line character sequences (e.g. '\n \r\t \f')
  //3)CHARACTER_TOKEN - any character sequence which doesn't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
  //
  //Appends `ch` to the pending character token when its type is `type`. A pending token of another
  //type is emitted first, and a new token of the requested type is started with `ch`.
  Tokenizer.prototype._appendCharToCurrentCharacterToken = function (type, ch) {
      if (this.currentCharacterToken && this.currentCharacterToken.type !== type)
          this._emitCurrentCharacterToken();

      if (!this.currentCharacterToken) {
          this._createCharacterToken(type, ch);
          return;
      }

      this.currentCharacterToken.chars += ch;
  };
  //Appends the character for `cp` to the pending character token, picking the token type from the
  //code point: whitespace, NULL, or a regular character. The whitespace check takes precedence.
  Tokenizer.prototype._emitCodePoint = function (cp) {
      var type = isWhitespace(cp) ? Tokenizer.WHITESPACE_CHARACTER_TOKEN :
                 cp === $.NULL ? Tokenizer.NULL_CHARACTER_TOKEN :
                 Tokenizer.CHARACTER_TOKEN;

      this._appendCharToCurrentCharacterToken(type, toChar(cp));
  };
  //Emits every code point of the `codePoints` array, in order, via _emitCodePoint().
  Tokenizer.prototype._emitSeveralCodePoints = function (codePoints) {
      var idx = 0;

      while (idx < codePoints.length) {
          this._emitCodePoint(codePoints[idx]);
          idx++;
      }
  };
  //NOTE: used when we emit a character explicitly. This is always a non-whitespace and a non-null character,
  //so the type checks done by _emitCodePoint() can be skipped here: the character always goes
  //into a CHARACTER_TOKEN.
  Tokenizer.prototype._emitChar = function (ch) {
      var type = Tokenizer.CHARACTER_TOKEN;

      this._appendCharToCurrentCharacterToken(type, ch);
  };
  393. //Character reference tokenization
  //Consumes the digits of a numeric entity and returns the code point it stands for.
  //At least one code point is always consumed, so the caller must have verified that a digit follows.
  //A semicolon right after the digits is consumed as well.
  //The parsed number is mapped through NUMERIC_ENTITY_REPLACEMENTS when it has an entry there; otherwise
  //reserved code points become $.REPLACEMENT_CHARACTER and everything else is returned unchanged.
  Tokenizer.prototype._consumeNumericEntity = function (isHex) {
      var digits = '',
          nextCp;

      do {
          digits += toChar(this._consume());
          nextCp = this._lookahead();
      } while (nextCp !== $.EOF && isDigit(nextCp, isHex));

      if (this._lookahead() === $.SEMICOLON)
          this._consume();

      var radix = isHex ? 16 : 10,
          referencedCp = parseInt(digits, radix),
          replacement = NUMERIC_ENTITY_REPLACEMENTS[referencedCp];

      if (replacement)
          return replacement;

      return isReservedCodePoint(referencedCp) ? $.REPLACEMENT_CHARACTER : referencedCp;
  };
  // NOTE: for the details on this algorithm see
  // https://github.com/inikulin/parse5/tree/master/scripts/generate_named_entity_data/README.md
  //
  //Walks the named entity tree (neTree) while consuming input and returns the code points of the
  //longest entity matched, or null when nothing matched or the match has to be rejected.
  //Everything that was consumed but does not belong to the returned entity is unconsumed.
  //`inAttr` enables the extra rule for entities inside attribute values (see below).
  //NOTE(review): a slot counts as a node marker when it is strictly less than MAX_BRANCH_MARKER_VALUE,
  //so a marker with all three flags set would be read as a plain code point - confirm the generated
  //tree never contains such a marker.
  Tokenizer.prototype._consumeNamedEntity = function (inAttr) {
      var matchedCodePoints = null,
          matchedLength = 0,
          consumedLength = 0,
          cp = null,
          endsWithSemicolon = false,
          i = 0;

      while (i > -1) {
          var current = neTree[i],
              inNode = current < MAX_BRANCH_MARKER_VALUE,
              nodeWithData = inNode && current & HAS_DATA_FLAG;

          if (nodeWithData) {
              //NOTE: the data slots follow the marker; reading them also moves `i` past them, which the
              //branch lookup below relies on.
              if (current & DATA_DUPLET_FLAG)
                  matchedCodePoints = [neTree[++i], neTree[++i]];
              else
                  matchedCodePoints = [neTree[++i]];

              matchedLength = consumedLength;

              if (cp === $.SEMICOLON) {
                  endsWithSemicolon = true;
                  break;
              }
          }

          cp = this._consume();
          consumedLength++;

          if (cp === $.EOF)
              break;

          if (!inNode)
              //NOTE: a slot that is not a marker must equal the consumed code point to keep going
              i = cp === current ? i + 1 : -1;
          else if (current & HAS_BRANCHES_FLAG)
              i = findNamedEntityTreeBranch(i, cp);
          else
              i = -1;
      }

      if (!matchedCodePoints) {
          this._unconsumeSeveral(consumedLength);
          return null;
      }

      if (!endsWithSemicolon) {
          //NOTE: unconsume excess (e.g. 'it' in '&notit')
          this._unconsumeSeveral(consumedLength - matchedLength);

          //NOTE: If the character reference is being consumed as part of an attribute and the next character
          //is either a U+003D EQUALS SIGN character (=) or an alphanumeric ASCII character, then, for historical
          //reasons, all the characters that were matched after the U+0026 AMPERSAND character (&) must be
          //unconsumed, and nothing is returned.
          //However, if this next character is in fact a U+003D EQUALS SIGN character (=), then this is a
          //parse error, because some legacy user agents will misinterpret the markup in those cases.
          //(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tokenizing-character-references)
          if (inAttr) {
              var nextCp = this._lookahead();

              if (nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp)) {
                  this._unconsumeSeveral(matchedLength);
                  return null;
              }
          }
      }

      return matchedCodePoints;
  };
  //Tries to consume a character reference whose first code point after the ampersand (`startCp`)
  //has already been consumed.
  //Returns an array of referenced code points, or null when the input is not a character reference.
  //In the null case everything consumed here, including `startCp`, is given back.
  //`inAttr` is passed through to _consumeNamedEntity().
  Tokenizer.prototype._consumeCharacterReference = function (startCp, inAttr) {
      var isNotReference = isWhitespace(startCp) || startCp === $.GREATER_THAN_SIGN ||
                           startCp === $.AMPERSAND || startCp === this.additionalAllowedCp || startCp === $.EOF;

      if (isNotReference) {
          //NOTE: not a character reference. No characters are consumed, and nothing is returned.
          this._unconsume();
          return null;
      }

      if (startCp !== $.NUMBER_SIGN) {
          //NOTE: named entity candidate. The matcher starts over from the code point after the ampersand.
          this._unconsume();
          return this._consumeNamedEntity(inAttr);
      }

      //NOTE: we have a numeric entity candidate, now we should determine if it's hex or decimal
      var isHex = false,
          nextCp = this._lookahead();

      if (nextCp === $.LATIN_SMALL_X || nextCp === $.LATIN_CAPITAL_X) {
          this._consume();
          isHex = true;
      }

      nextCp = this._lookahead();

      //NOTE: if we have at least one digit this is a numeric entity for sure, so we consume it
      if (nextCp !== $.EOF && isDigit(nextCp, isHex))
          return [this._consumeNumericEntity(isHex)];

      //NOTE: otherwise this is a bogus number entity and a parse error. Unconsume the number sign
      //and the 'x'-character if appropriate.
      this._unconsumeSeveral(isHex ? 2 : 1);

      return null;
  };
  //State machine
  //Handlers are stored on the prototype under the name of the state they serve.
  var _ = Tokenizer.prototype;

  //12.2.4.1 Data state
  //------------------------------------------------------------------
  _[DATA_STATE] = function dataState(cp) {
      this.preprocessor.dropParsedChunk();

      switch (cp) {
          case $.AMPERSAND:
              this.state = CHARACTER_REFERENCE_IN_DATA_STATE;
              break;

          case $.LESS_THAN_SIGN:
              this.state = TAG_OPEN_STATE;
              break;

          case $.EOF:
              this._emitEOFToken();
              break;

          //NOTE: a NULL code point is emitted as is, exactly like any other code point in this state.
          case $.NULL:
          default:
              this._emitCodePoint(cp);
      }
  };
  508. //12.2.4.2 Character reference in data state
  509. //------------------------------------------------------------------
  _[CHARACTER_REFERENCE_IN_DATA_STATE] = function characterReferenceInDataState(cp) {
      //NOTE: there is no additional allowed code point in this state.
      this.additionalAllowedCp = void 0;

      var referencedCodePoints = this._consumeCharacterReference(cp, false);

      //NOTE: if the chunk ended while the reference was being consumed, nothing is emitted and
      //the state is left unchanged.
      if (this._ensureHibernation())
          return;

      if (referencedCodePoints)
          this._emitSeveralCodePoints(referencedCodePoints);
      else
          this._emitChar('&');

      this.state = DATA_STATE;
  };
  521. //12.2.4.3 RCDATA state
  522. //------------------------------------------------------------------
  _[RCDATA_STATE] = function rcdataState(cp) {
      this.preprocessor.dropParsedChunk();

      switch (cp) {
          case $.AMPERSAND:
              this.state = CHARACTER_REFERENCE_IN_RCDATA_STATE;
              break;

          case $.LESS_THAN_SIGN:
              this.state = RCDATA_LESS_THAN_SIGN_STATE;
              break;

          //NOTE: a NULL code point is replaced with the replacement character
          case $.NULL:
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._emitEOFToken();
              break;

          default:
              this._emitCodePoint(cp);
      }
  };
  536. //12.2.4.4 Character reference in RCDATA state
  537. //------------------------------------------------------------------
  _[CHARACTER_REFERENCE_IN_RCDATA_STATE] = function characterReferenceInRcdataState(cp) {
      //NOTE: there is no additional allowed code point in this state.
      this.additionalAllowedCp = void 0;

      var referencedCodePoints = this._consumeCharacterReference(cp, false);

      //NOTE: if the chunk ended while the reference was being consumed, nothing is emitted and
      //the state is left unchanged.
      if (this._ensureHibernation())
          return;

      if (referencedCodePoints)
          this._emitSeveralCodePoints(referencedCodePoints);
      else
          this._emitChar('&');

      this.state = RCDATA_STATE;
  };
  549. //12.2.4.5 RAWTEXT state
  550. //------------------------------------------------------------------
  _[RAWTEXT_STATE] = function rawtextState(cp) {
      this.preprocessor.dropParsedChunk();

      switch (cp) {
          case $.LESS_THAN_SIGN:
              this.state = RAWTEXT_LESS_THAN_SIGN_STATE;
              break;

          //NOTE: a NULL code point is replaced with the replacement character
          case $.NULL:
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._emitEOFToken();
              break;

          default:
              this._emitCodePoint(cp);
      }
  };
  562. //12.2.4.6 Script data state
  563. //------------------------------------------------------------------
  _[SCRIPT_DATA_STATE] = function scriptDataState(cp) {
      // Handles one code point of script data: '<' switches to the script
      // data less-than sign state, NULL becomes the replacement character,
      // EOF emits the EOF token and any other code point is emitted.
      this.preprocessor.dropParsedChunk();

      switch (cp) {
          case $.LESS_THAN_SIGN:
              this.state = SCRIPT_DATA_LESS_THAN_SIGN_STATE;
              break;

          case $.NULL:
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._emitEOFToken();
              break;

          default:
              this._emitCodePoint(cp);
      }
  };
  575. //12.2.4.7 PLAINTEXT state
  576. //------------------------------------------------------------------
  _[PLAINTEXT_STATE] = function plaintextState(cp) {
      // PLAINTEXT never leaves its state here: NULL is replaced, EOF emits
      // the EOF token, every other code point is emitted unchanged.
      this.preprocessor.dropParsedChunk();

      if (cp === $.NULL) {
          this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
          return;
      }

      if (cp === $.EOF) {
          this._emitEOFToken();
          return;
      }

      this._emitCodePoint(cp);
  };
  586. //12.2.4.8 Tag open state
  587. //------------------------------------------------------------------
  _[TAG_OPEN_STATE] = function tagOpenState(cp) {
      // Decides what follows a '<' seen in data: a markup declaration ('!'),
      // an end tag ('/'), a start tag (ASCII letter), a bogus comment ('?'),
      // or plain text, in which case the '<' itself is emitted.
      if (cp === $.EXCLAMATION_MARK) {
          this.state = MARKUP_DECLARATION_OPEN_STATE;
          return;
      }

      if (cp === $.SOLIDUS) {
          this.state = END_TAG_OPEN_STATE;
          return;
      }

      if (isAsciiLetter(cp)) {
          this._createStartTagToken();
          this._reconsumeInState(TAG_NAME_STATE);
          return;
      }

      if (cp === $.QUESTION_MARK) {
          this._reconsumeInState(BOGUS_COMMENT_STATE);
          return;
      }

      this._emitChar('<');
      this._reconsumeInState(DATA_STATE);
  };
  604. //12.2.4.9 End tag open state
  605. //------------------------------------------------------------------
  _[END_TAG_OPEN_STATE] = function endTagOpenState(cp) {
      // Runs after '</'. A letter starts an end tag token, '>' simply returns
      // to data, EOF re-enters data and emits '</' as text, and any other
      // code point is handed to the bogus comment state.
      if (isAsciiLetter(cp)) {
          this._createEndTagToken();
          this._reconsumeInState(TAG_NAME_STATE);
          return;
      }

      switch (cp) {
          case $.GREATER_THAN_SIGN:
              this.state = DATA_STATE;
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              this._emitChar('<');
              this._emitChar('/');
              break;

          default:
              this._reconsumeInState(BOGUS_COMMENT_STATE);
      }
  };
  621. //12.2.4.10 Tag name state
  622. //------------------------------------------------------------------
  _[TAG_NAME_STATE] = function tagNameState(cp) {
      // Appends to the current token's tag name until whitespace, '/', '>'
      // or EOF ends it. Code points matched by isAsciiUpper go through
      // toAsciiLowerChar; NULL is stored as the replacement character.
      if (isWhitespace(cp)) {
          this.state = BEFORE_ATTRIBUTE_NAME_STATE;
          return;
      }

      if (cp === $.SOLIDUS) {
          this.state = SELF_CLOSING_START_TAG_STATE;
          return;
      }

      if (cp === $.GREATER_THAN_SIGN) {
          this.state = DATA_STATE;
          this._emitCurrentToken();
          return;
      }

      if (isAsciiUpper(cp)) {
          this.currentToken.tagName += toAsciiLowerChar(cp);
          return;
      }

      if (cp === $.NULL) {
          this.currentToken.tagName += UNICODE.REPLACEMENT_CHARACTER;
          return;
      }

      if (cp === $.EOF) {
          this._reconsumeInState(DATA_STATE);
          return;
      }

      this.currentToken.tagName += toChar(cp);
  };
  641. //12.2.4.11 RCDATA less-than sign state
  642. //------------------------------------------------------------------
  _[RCDATA_LESS_THAN_SIGN_STATE] = function rcdataLessThanSignState(cp) {
      // Runs after '<' in RCDATA. Anything but '/' means the '<' was text.
      if (cp !== $.SOLIDUS) {
          this._emitChar('<');
          this._reconsumeInState(RCDATA_STATE);
          return;
      }

      // Possible end tag: start with an empty temporary buffer.
      this.tempBuff = [];
      this.state = RCDATA_END_TAG_OPEN_STATE;
  };
  653. //12.2.4.12 RCDATA end tag open state
  654. //------------------------------------------------------------------
  _[RCDATA_END_TAG_OPEN_STATE] = function rcdataEndTagOpenState(cp) {
      // Runs after '</' in RCDATA. Without a letter the '</' is emitted as
      // text and the code point is reconsumed in the RCDATA state.
      if (!isAsciiLetter(cp)) {
          this._emitChar('<');
          this._emitChar('/');
          this._reconsumeInState(RCDATA_STATE);
          return;
      }

      this._createEndTagToken();
      this._reconsumeInState(RCDATA_END_TAG_NAME_STATE);
  };
  666. //12.2.4.13 RCDATA end tag name state
  667. //------------------------------------------------------------------
  _[RCDATA_END_TAG_NAME_STATE] = function rcdataEndTagNameState(cp) {
      // Builds the candidate end tag name and mirrors each letter into
      // tempBuff. If the name does not end as an appropriate end tag, '</'
      // plus the buffered code points are emitted as text and the current
      // code point is reconsumed in the RCDATA state.
      var upper = isAsciiUpper(cp);

      if (upper || isAsciiLower(cp)) {
          this.currentToken.tagName += upper ? toAsciiLowerChar(cp) : toChar(cp);
          this.tempBuff.push(cp);
          return;
      }

      if (this._isAppropriateEndTagToken()) {
          if (isWhitespace(cp)) {
              this.state = BEFORE_ATTRIBUTE_NAME_STATE;
              return;
          }

          switch (cp) {
              case $.SOLIDUS:
                  this.state = SELF_CLOSING_START_TAG_STATE;
                  return;

              case $.GREATER_THAN_SIGN:
                  this.state = DATA_STATE;
                  this._emitCurrentToken();
                  return;
          }
      }

      this._emitChar('<');
      this._emitChar('/');
      this._emitSeveralCodePoints(this.tempBuff);
      this._reconsumeInState(RCDATA_STATE);
  };
  699. //12.2.4.14 RAWTEXT less-than sign state
  700. //------------------------------------------------------------------
  _[RAWTEXT_LESS_THAN_SIGN_STATE] = function rawtextLessThanSignState(cp) {
      // Runs after '<' in RAWTEXT. Anything but '/' means the '<' was text.
      if (cp !== $.SOLIDUS) {
          this._emitChar('<');
          this._reconsumeInState(RAWTEXT_STATE);
          return;
      }

      // Possible end tag: start with an empty temporary buffer.
      this.tempBuff = [];
      this.state = RAWTEXT_END_TAG_OPEN_STATE;
  };
  711. //12.2.4.15 RAWTEXT end tag open state
  712. //------------------------------------------------------------------
  _[RAWTEXT_END_TAG_OPEN_STATE] = function rawtextEndTagOpenState(cp) {
      // Runs after '</' in RAWTEXT. Without a letter the '</' is emitted as
      // text and the code point is reconsumed in the RAWTEXT state.
      if (!isAsciiLetter(cp)) {
          this._emitChar('<');
          this._emitChar('/');
          this._reconsumeInState(RAWTEXT_STATE);
          return;
      }

      this._createEndTagToken();
      this._reconsumeInState(RAWTEXT_END_TAG_NAME_STATE);
  };
  724. //12.2.4.16 RAWTEXT end tag name state
  725. //------------------------------------------------------------------
  _[RAWTEXT_END_TAG_NAME_STATE] = function rawtextEndTagNameState(cp) {
      // Builds the candidate end tag name and mirrors each letter into
      // tempBuff. If the name does not end as an appropriate end tag, '</'
      // plus the buffered code points are emitted as text and the current
      // code point is reconsumed in the RAWTEXT state.
      var upper = isAsciiUpper(cp);

      if (upper || isAsciiLower(cp)) {
          this.currentToken.tagName += upper ? toAsciiLowerChar(cp) : toChar(cp);
          this.tempBuff.push(cp);
          return;
      }

      if (this._isAppropriateEndTagToken()) {
          if (isWhitespace(cp)) {
              this.state = BEFORE_ATTRIBUTE_NAME_STATE;
              return;
          }

          switch (cp) {
              case $.SOLIDUS:
                  this.state = SELF_CLOSING_START_TAG_STATE;
                  return;

              case $.GREATER_THAN_SIGN:
                  this._emitCurrentToken();
                  this.state = DATA_STATE;
                  return;
          }
      }

      this._emitChar('<');
      this._emitChar('/');
      this._emitSeveralCodePoints(this.tempBuff);
      this._reconsumeInState(RAWTEXT_STATE);
  };
  757. //12.2.4.17 Script data less-than sign state
  758. //------------------------------------------------------------------
  _[SCRIPT_DATA_LESS_THAN_SIGN_STATE] = function scriptDataLessThanSignState(cp) {
      // Runs after '<' in script data: '/' may open an end tag, '!' may open
      // an escaped section (and '<!' is emitted), anything else means the
      // '<' was plain script text.
      switch (cp) {
          case $.SOLIDUS:
              this.tempBuff = [];
              this.state = SCRIPT_DATA_END_TAG_OPEN_STATE;
              break;

          case $.EXCLAMATION_MARK:
              this.state = SCRIPT_DATA_ESCAPE_START_STATE;
              this._emitChar('<');
              this._emitChar('!');
              break;

          default:
              this._emitChar('<');
              this._reconsumeInState(SCRIPT_DATA_STATE);
      }
  };
  774. //12.2.4.18 Script data end tag open state
  775. //------------------------------------------------------------------
  _[SCRIPT_DATA_END_TAG_OPEN_STATE] = function scriptDataEndTagOpenState(cp) {
      // Runs after '</' in script data. Without a letter the '</' is emitted
      // as text and the code point is reconsumed in the script data state.
      if (!isAsciiLetter(cp)) {
          this._emitChar('<');
          this._emitChar('/');
          this._reconsumeInState(SCRIPT_DATA_STATE);
          return;
      }

      this._createEndTagToken();
      this._reconsumeInState(SCRIPT_DATA_END_TAG_NAME_STATE);
  };
  787. //12.2.4.19 Script data end tag name state
  788. //------------------------------------------------------------------
  _[SCRIPT_DATA_END_TAG_NAME_STATE] = function scriptDataEndTagNameState(cp) {
      // Builds the candidate end tag name and mirrors each letter into
      // tempBuff. If the name does not end as an appropriate end tag, '</'
      // plus the buffered code points are emitted as text and the current
      // code point is reconsumed in the script data state.
      var upper = isAsciiUpper(cp);

      if (upper || isAsciiLower(cp)) {
          this.currentToken.tagName += upper ? toAsciiLowerChar(cp) : toChar(cp);
          this.tempBuff.push(cp);
          return;
      }

      if (this._isAppropriateEndTagToken()) {
          if (isWhitespace(cp)) {
              this.state = BEFORE_ATTRIBUTE_NAME_STATE;
              return;
          }

          switch (cp) {
              case $.SOLIDUS:
                  this.state = SELF_CLOSING_START_TAG_STATE;
                  return;

              case $.GREATER_THAN_SIGN:
                  this._emitCurrentToken();
                  this.state = DATA_STATE;
                  return;
          }
      }

      this._emitChar('<');
      this._emitChar('/');
      this._emitSeveralCodePoints(this.tempBuff);
      this._reconsumeInState(SCRIPT_DATA_STATE);
  };
  820. //12.2.4.20 Script data escape start state
  821. //------------------------------------------------------------------
  _[SCRIPT_DATA_ESCAPE_START_STATE] = function scriptDataEscapeStartState(cp) {
      // Runs after '<!' in script data: only a '-' continues towards an
      // escaped section; anything else is reconsumed as script data.
      if (cp !== $.HYPHEN_MINUS) {
          this._reconsumeInState(SCRIPT_DATA_STATE);
          return;
      }

      this.state = SCRIPT_DATA_ESCAPE_START_DASH_STATE;
      this._emitChar('-');
  };
  830. //12.2.4.21 Script data escape start dash state
  831. //------------------------------------------------------------------
  _[SCRIPT_DATA_ESCAPE_START_DASH_STATE] = function scriptDataEscapeStartDashState(cp) {
      // Runs after '<!-' in script data: a second '-' enters the escaped
      // dash-dash state; anything else is reconsumed as script data.
      if (cp !== $.HYPHEN_MINUS) {
          this._reconsumeInState(SCRIPT_DATA_STATE);
          return;
      }

      this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
      this._emitChar('-');
  };
  840. //12.2.4.22 Script data escaped state
  841. //------------------------------------------------------------------
  _[SCRIPT_DATA_ESCAPED_STATE] = function scriptDataEscapedState(cp) {
      // Handles one code point inside an escaped script section. '-' is
      // emitted and tracked, '<' may open a tag, NULL is replaced, EOF is
      // reconsumed in the data state, the rest is emitted unchanged.
      switch (cp) {
          case $.HYPHEN_MINUS:
              this.state = SCRIPT_DATA_ESCAPED_DASH_STATE;
              this._emitChar('-');
              break;

          case $.LESS_THAN_SIGN:
              this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
              break;

          case $.NULL:
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this._emitCodePoint(cp);
      }
  };
  856. //12.2.4.23 Script data escaped dash state
  857. //------------------------------------------------------------------
  _[SCRIPT_DATA_ESCAPED_DASH_STATE] = function scriptDataEscapedDashState(cp) {
      // Runs after a single '-' in an escaped script section. A second '-'
      // moves to the dash-dash state; '<' may open a tag; EOF is reconsumed
      // in the data state; anything else drops back to the escaped state.
      switch (cp) {
          case $.HYPHEN_MINUS:
              this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
              this._emitChar('-');
              break;

          case $.LESS_THAN_SIGN:
              this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
              break;

          case $.NULL:
              this.state = SCRIPT_DATA_ESCAPED_STATE;
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.state = SCRIPT_DATA_ESCAPED_STATE;
              this._emitCodePoint(cp);
      }
  };
  876. //12.2.4.24 Script data escaped dash dash state
  877. //------------------------------------------------------------------
  _[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE] = function scriptDataEscapedDashDashState(cp) {
      // Runs after '--' in an escaped script section. Further '-' are
      // emitted without a state change, '>' ends the escaped section and
      // returns to script data, '<' may open a tag, EOF is reconsumed in the
      // data state and anything else drops back to the escaped state.
      switch (cp) {
          case $.HYPHEN_MINUS:
              this._emitChar('-');
              break;

          case $.LESS_THAN_SIGN:
              this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
              break;

          case $.GREATER_THAN_SIGN:
              this.state = SCRIPT_DATA_STATE;
              this._emitChar('>');
              break;

          case $.NULL:
              this.state = SCRIPT_DATA_ESCAPED_STATE;
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.state = SCRIPT_DATA_ESCAPED_STATE;
              this._emitCodePoint(cp);
      }
  };
  898. //12.2.4.25 Script data escaped less-than sign state
  899. //------------------------------------------------------------------
  _[SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE] = function scriptDataEscapedLessThanSignState(cp) {
      // Runs after '<' in an escaped script section. '/' may open an end
      // tag; a letter may start a double-escaped section (the '<' is
      // emitted); otherwise the '<' is emitted as text. The temporary buffer
      // is reset in the first two cases.
      if (cp === $.SOLIDUS) {
          this.tempBuff = [];
          this.state = SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE;
          return;
      }

      if (isAsciiLetter(cp)) {
          this.tempBuff = [];
          this._emitChar('<');
          this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE);
          return;
      }

      this._emitChar('<');
      this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  };
  915. //12.2.4.26 Script data escaped end tag open state
  916. //------------------------------------------------------------------
  _[SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE] = function scriptDataEscapedEndTagOpenState(cp) {
      // Runs after '</' in an escaped script section. Without a letter the
      // '</' is emitted as text and the code point is reconsumed in the
      // script data escaped state.
      if (!isAsciiLetter(cp)) {
          this._emitChar('<');
          this._emitChar('/');
          this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
          return;
      }

      this._createEndTagToken();
      this._reconsumeInState(SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE);
  };
  928. //12.2.4.27 Script data escaped end tag name state
  929. //------------------------------------------------------------------
  _[SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE] = function scriptDataEscapedEndTagNameState(cp) {
      // Builds the candidate end tag name and mirrors each letter into
      // tempBuff. If the name does not end as an appropriate end tag, '</'
      // plus the buffered code points are emitted as text and the current
      // code point is reconsumed in the script data escaped state.
      var upper = isAsciiUpper(cp);

      if (upper || isAsciiLower(cp)) {
          this.currentToken.tagName += upper ? toAsciiLowerChar(cp) : toChar(cp);
          this.tempBuff.push(cp);
          return;
      }

      if (this._isAppropriateEndTagToken()) {
          if (isWhitespace(cp)) {
              this.state = BEFORE_ATTRIBUTE_NAME_STATE;
              return;
          }

          switch (cp) {
              case $.SOLIDUS:
                  this.state = SELF_CLOSING_START_TAG_STATE;
                  return;

              case $.GREATER_THAN_SIGN:
                  this._emitCurrentToken();
                  this.state = DATA_STATE;
                  return;
          }
      }

      this._emitChar('<');
      this._emitChar('/');
      this._emitSeveralCodePoints(this.tempBuff);
      this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  };
  961. //12.2.4.28 Script data double escape start state
  962. //------------------------------------------------------------------
  _[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE] = function scriptDataDoubleEscapeStartState(cp) {
      // Collects letters into tempBuff (upper-case ones converted with
      // toAsciiLowerCodePoint) while emitting them. On whitespace, '/' or
      // '>' the buffer is compared against the script string to choose
      // between the double escaped and the escaped state.
      if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
          if (this.isTempBufferEqualToScriptString()) {
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
          }
          else {
              this.state = SCRIPT_DATA_ESCAPED_STATE;
          }

          this._emitCodePoint(cp);
          return;
      }

      var upper = isAsciiUpper(cp);

      if (upper || isAsciiLower(cp)) {
          this.tempBuff.push(upper ? toAsciiLowerCodePoint(cp) : cp);
          this._emitCodePoint(cp);
          return;
      }

      this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  };
  979. //12.2.4.29 Script data double escaped state
  980. //------------------------------------------------------------------
  _[SCRIPT_DATA_DOUBLE_ESCAPED_STATE] = function scriptDataDoubleEscapedState(cp) {
      // Handles one code point inside a double-escaped script section. '-'
      // and '<' are emitted and switch state, NULL is replaced, EOF is
      // reconsumed in the data state, the rest is emitted unchanged.
      switch (cp) {
          case $.HYPHEN_MINUS:
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE;
              this._emitChar('-');
              break;

          case $.LESS_THAN_SIGN:
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
              this._emitChar('<');
              break;

          case $.NULL:
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this._emitCodePoint(cp);
      }
  };
  997. //12.2.4.30 Script data double escaped dash state
  998. //------------------------------------------------------------------
  _[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE] = function scriptDataDoubleEscapedDashState(cp) {
      // Runs after a single '-' in a double-escaped script section. A second
      // '-' moves to the dash-dash state, '<' is emitted and may open a tag,
      // EOF is reconsumed in the data state, and anything else drops back to
      // the double escaped state.
      switch (cp) {
          case $.HYPHEN_MINUS:
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE;
              this._emitChar('-');
              break;

          case $.LESS_THAN_SIGN:
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
              this._emitChar('<');
              break;

          case $.NULL:
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
              this._emitCodePoint(cp);
      }
  };
  1019. //12.2.4.31 Script data double escaped dash dash state
  1020. //------------------------------------------------------------------
  _[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE] = function scriptDataDoubleEscapedDashDashState(cp) {
      // Runs after '--' in a double-escaped script section. Further '-' are
      // emitted without a state change, '>' returns to script data, '<' is
      // emitted and may open a tag, EOF is reconsumed in the data state and
      // anything else drops back to the double escaped state.
      switch (cp) {
          case $.HYPHEN_MINUS:
              this._emitChar('-');
              break;

          case $.LESS_THAN_SIGN:
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
              this._emitChar('<');
              break;

          case $.GREATER_THAN_SIGN:
              this.state = SCRIPT_DATA_STATE;
              this._emitChar('>');
              break;

          case $.NULL:
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
              this._emitCodePoint(cp);
      }
  };
  1043. //12.2.4.32 Script data double escaped less-than sign state
  1044. //------------------------------------------------------------------
  _[SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE] = function scriptDataDoubleEscapedLessThanSignState(cp) {
      // Runs after '<' in a double-escaped script section: only '/' starts
      // the double escape end check; anything else is reconsumed.
      if (cp !== $.SOLIDUS) {
          this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
          return;
      }

      this.tempBuff = [];
      this.state = SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE;
      this._emitChar('/');
  };
  1054. //12.2.4.33 Script data double escape end state
  1055. //------------------------------------------------------------------
  _[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE] = function scriptDataDoubleEscapeEndState(cp) {
      // Collects letters into tempBuff (upper-case ones converted with
      // toAsciiLowerCodePoint) while emitting them. On whitespace, '/' or
      // '>' a buffer equal to the script string returns to the escaped
      // state; otherwise the tokenizer stays double escaped.
      if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
          if (this.isTempBufferEqualToScriptString()) {
              this.state = SCRIPT_DATA_ESCAPED_STATE;
          }
          else {
              this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
          }

          this._emitCodePoint(cp);
          return;
      }

      var upper = isAsciiUpper(cp);

      if (upper || isAsciiLower(cp)) {
          this.tempBuff.push(upper ? toAsciiLowerCodePoint(cp) : cp);
          this._emitCodePoint(cp);
          return;
      }

      this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
  };
  1072. //12.2.4.34 Before attribute name state
  1073. //------------------------------------------------------------------
  _[BEFORE_ATTRIBUTE_NAME_STATE] = function beforeAttributeNameState(cp) {
      // Skips whitespace before an attribute name. '/', '>' and EOF are
      // delegated to the after attribute name state; a leading '=' becomes
      // the first character of a new attribute name; anything else starts a
      // new, empty-named attribute and is reconsumed as its name.
      if (isWhitespace(cp)) {
          return;
      }

      switch (cp) {
          case $.SOLIDUS:
          case $.GREATER_THAN_SIGN:
          case $.EOF:
              this._reconsumeInState(AFTER_ATTRIBUTE_NAME_STATE);
              break;

          case $.EQUALS_SIGN:
              this._createAttr('=');
              this.state = ATTRIBUTE_NAME_STATE;
              break;

          default:
              this._createAttr('');
              this._reconsumeInState(ATTRIBUTE_NAME_STATE);
      }
  };
  1088. //12.2.4.35 Attribute name state
  1089. //------------------------------------------------------------------
  _[ATTRIBUTE_NAME_STATE] = function attributeNameState(cp) {
      // Appends to the current attribute's name. Whitespace, '/', '>' and
      // EOF finish the name and are unconsumed so the after attribute name
      // state sees them again; '=' finishes the name and moves on to the
      // value.
      if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF) {
          this._leaveAttrName(AFTER_ATTRIBUTE_NAME_STATE);
          this._unconsume();
          return;
      }

      if (cp === $.EQUALS_SIGN) {
          this._leaveAttrName(BEFORE_ATTRIBUTE_VALUE_STATE);
          return;
      }

      if (isAsciiUpper(cp)) {
          this.currentAttr.name += toAsciiLowerChar(cp);
          return;
      }

      if (cp === $.NULL) {
          this.currentAttr.name += UNICODE.REPLACEMENT_CHARACTER;
          return;
      }

      // '"', "'" and '<' are appended exactly like any other code point.
      this.currentAttr.name += toChar(cp);
  };
  1106. //12.2.4.36 After attribute name state
  1107. //------------------------------------------------------------------
  _[AFTER_ATTRIBUTE_NAME_STATE] = function afterAttributeNameState(cp) {
      // Skips whitespace after an attribute name, then either continues with
      // '/', '=' or '>', gives up on EOF, or starts a new, empty-named
      // attribute with the current code point.
      if (isWhitespace(cp)) {
          return;
      }

      switch (cp) {
          case $.SOLIDUS:
              this.state = SELF_CLOSING_START_TAG_STATE;
              break;

          case $.EQUALS_SIGN:
              this.state = BEFORE_ATTRIBUTE_VALUE_STATE;
              break;

          case $.GREATER_THAN_SIGN:
              this.state = DATA_STATE;
              this._emitCurrentToken();
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this._createAttr('');
              this._reconsumeInState(ATTRIBUTE_NAME_STATE);
      }
  };
  1126. //12.2.4.37 Before attribute value state
  1127. //------------------------------------------------------------------
  _[BEFORE_ATTRIBUTE_VALUE_STATE] = function beforeAttributeValueState(cp) {
      // Skips whitespace after '=' and picks the double-quoted,
      // single-quoted or unquoted value state. Only the unquoted case
      // reconsumes the current code point.
      if (isWhitespace(cp)) {
          return;
      }

      switch (cp) {
          case $.QUOTATION_MARK:
              this.state = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
              break;

          case $.APOSTROPHE:
              this.state = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
              break;

          default:
              this._reconsumeInState(ATTRIBUTE_VALUE_UNQUOTED_STATE);
      }
  };
  1138. //12.2.4.38 Attribute value (double-quoted) state
  1139. //------------------------------------------------------------------
  _[ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE] = function attributeValueDoubleQuotedState(cp) {
      // Appends to a double-quoted attribute value until the closing '"'.
      // '&' hands over to the character reference state, remembering '"' as
      // the additional allowed code point and this state as the return
      // state.
      switch (cp) {
          case $.QUOTATION_MARK:
              this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
              break;

          case $.AMPERSAND:
              this.additionalAllowedCp = $.QUOTATION_MARK;
              this.returnState = this.state;
              this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
              break;

          case $.NULL:
              this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentAttr.value += toChar(cp);
      }
  };
  1155. //12.2.4.39 Attribute value (single-quoted) state
  1156. //------------------------------------------------------------------
  _[ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE] = function attributeValueSingleQuotedState(cp) {
      // Appends to a single-quoted attribute value until the closing "'".
      // '&' hands over to the character reference state, remembering "'" as
      // the additional allowed code point and this state as the return
      // state.
      switch (cp) {
          case $.APOSTROPHE:
              this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
              break;

          case $.AMPERSAND:
              this.additionalAllowedCp = $.APOSTROPHE;
              this.returnState = this.state;
              this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
              break;

          case $.NULL:
              this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentAttr.value += toChar(cp);
      }
  };
  1172. //12.2.4.40 Attribute value (unquoted) state
  1173. //------------------------------------------------------------------
  _[ATTRIBUTE_VALUE_UNQUOTED_STATE] = function attributeValueUnquotedState(cp) {
      // Appends to an unquoted attribute value. Whitespace ends the value,
      // '>' ends it and emits the tag, '&' hands over to the character
      // reference state with '>' as the additional allowed code point.
      if (isWhitespace(cp)) {
          this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE);
          return;
      }

      switch (cp) {
          case $.AMPERSAND:
              this.additionalAllowedCp = $.GREATER_THAN_SIGN;
              this.returnState = this.state;
              this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
              break;

          case $.GREATER_THAN_SIGN:
              this._leaveAttrValue(DATA_STATE);
              this._emitCurrentToken();
              break;

          case $.NULL:
              this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              // '"', "'", '<', '=' and '`' are appended exactly like any
              // other code point.
              this.currentAttr.value += toChar(cp);
      }
  };
  1196. //12.2.4.41 Character reference in attribute value state
  1197. //------------------------------------------------------------------
  _[CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE] = function characterReferenceInAttributeValueState(cp) {
      // Tries to consume a character reference in attribute mode and appends
      // either the referenced code points or a literal '&' to the current
      // attribute value, then resumes the stored return state.
      var decoded = this._consumeCharacterReference(cp, true);

      // When _ensureHibernation() reports true this handler appends nothing
      // and does not switch state itself.
      if (this._ensureHibernation()) {
          return;
      }

      if (decoded) {
          for (var idx = 0; idx < decoded.length; idx++) {
              this.currentAttr.value += toChar(decoded[idx]);
          }
      }
      else {
          this.currentAttr.value += '&';
      }

      this.state = this.returnState;
  };
  1210. //12.2.4.42 After attribute value (quoted) state
  1211. //------------------------------------------------------------------
  _[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE] = function afterAttributeValueQuotedState(cp) {
      // Runs right after a closing quote. Whitespace, '/' and '>' finish the
      // attribute value; EOF is reconsumed in the data state; anything else
      // is reconsumed as the start of another attribute.
      if (isWhitespace(cp)) {
          this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE);
          return;
      }

      switch (cp) {
          case $.SOLIDUS:
              this._leaveAttrValue(SELF_CLOSING_START_TAG_STATE);
              break;

          case $.GREATER_THAN_SIGN:
              this._leaveAttrValue(DATA_STATE);
              this._emitCurrentToken();
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
      }
  };
  1226. //12.2.4.43 Self-closing start tag state
  1227. //------------------------------------------------------------------
  _[SELF_CLOSING_START_TAG_STATE] = function selfClosingStartTagState(cp) {
      // Runs after '/' inside a tag. Only '>' marks the token self-closing
      // and emits it; EOF is reconsumed in the data state; anything else is
      // reconsumed as the start of an attribute.
      switch (cp) {
          case $.GREATER_THAN_SIGN:
              this.currentToken.selfClosing = true;
              this.state = DATA_STATE;
              this._emitCurrentToken();
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
      }
  };
  1239. //12.2.4.44 Bogus comment state
  1240. //------------------------------------------------------------------
  _[BOGUS_COMMENT_STATE] = function bogusCommentState() {
      // Entry point for a bogus comment: create the comment token exactly
      // once, then let the continuation state collect its data.
      this._createCommentToken();
      this._reconsumeInState(BOGUS_COMMENT_STATE_CONTINUATION);
  };
  1245. //HACK: to support streaming and make BOGUS_COMMENT_STATE reentrant we've
  1246. //introduced BOGUS_COMMENT_STATE_CONTINUATION state which will not produce
  1247. //comment token on each call.
  _[BOGUS_COMMENT_STATE_CONTINUATION] = function bogusCommentStateContinuation(cp) {
      // Appends code points to the current comment token until '>' or EOF,
      // then emits the token. NULL is stored as the replacement character.
      while (cp !== $.GREATER_THAN_SIGN && cp !== $.EOF) {
          this.currentToken.data += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);

          // Take a snapshot before consuming; if _ensureHibernation() then
          // reports true, return without emitting the token.
          this._hibernationSnapshot();
          cp = this._consume();

          if (this._ensureHibernation()) {
              return;
          }
      }

      if (cp === $.GREATER_THAN_SIGN) {
          this.state = DATA_STATE;
      }
      else {
          this._reconsumeInState(DATA_STATE);
      }

      this._emitCurrentToken();
  };
  1268. //12.2.4.45 Markup declaration open state
  1269. //------------------------------------------------------------------
  _[MARKUP_DECLARATION_OPEN_STATE] = function markupDeclarationOpenState(cp) {
      // Runs after '<!'. Tries, in this order, to match the dash-dash,
      // DOCTYPE and (only when allowCDATA is set) CDATA start strings. Each
      // later match is attempted only if the earlier ones failed.
      var isComment = this._consumeSubsequentIfMatch($$.DASH_DASH_STRING, cp, true);
      var isDoctype = !isComment && this._consumeSubsequentIfMatch($$.DOCTYPE_STRING, cp, false);
      var isCdata = !isComment && !isDoctype &&
                    this.allowCDATA &&
                    this._consumeSubsequentIfMatch($$.CDATA_START_STRING, cp, true);

      // When _ensureHibernation() reports true no state transition is made.
      if (this._ensureHibernation()) {
          return;
      }

      if (isComment) {
          this._createCommentToken();
          this.state = COMMENT_START_STATE;
          return;
      }

      if (isDoctype) {
          this.state = DOCTYPE_STATE;
          return;
      }

      if (isCdata) {
          this.state = CDATA_SECTION_STATE;
          return;
      }

      this._reconsumeInState(BOGUS_COMMENT_STATE);
  };
  1289. //12.2.4.46 Comment start state
  1290. //------------------------------------------------------------------
  _[COMMENT_START_STATE] = function commentStartState(cp) {
      // First code point after '<!--'. '-' may begin an immediate close, '>'
      // and EOF emit the (empty) comment, anything else becomes comment
      // data. NULL is stored as the replacement character.
      switch (cp) {
          case $.HYPHEN_MINUS:
              this.state = COMMENT_START_DASH_STATE;
              break;

          case $.NULL:
              this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
              this.state = COMMENT_STATE;
              break;

          case $.GREATER_THAN_SIGN:
              this.state = DATA_STATE;
              this._emitCurrentToken();
              break;

          case $.EOF:
              this._emitCurrentToken();
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentToken.data += toChar(cp);
              this.state = COMMENT_STATE;
      }
  };
  1311. //12.2.4.47 Comment start dash state
  1312. //------------------------------------------------------------------
  _[COMMENT_START_DASH_STATE] = function commentStartDashState(cp) {
      // Runs after '<!---'. A second '-' moves to the comment end state; '>'
      // and EOF emit the comment; otherwise the pending '-' plus the current
      // code point (NULL replaced) become comment data.
      switch (cp) {
          case $.HYPHEN_MINUS:
              this.state = COMMENT_END_STATE;
              break;

          case $.NULL:
              this.currentToken.data += '-';
              this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
              this.state = COMMENT_STATE;
              break;

          case $.GREATER_THAN_SIGN:
              this.state = DATA_STATE;
              this._emitCurrentToken();
              break;

          case $.EOF:
              this._emitCurrentToken();
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentToken.data += '-';
              this.currentToken.data += toChar(cp);
              this.state = COMMENT_STATE;
      }
  };
  1335. //12.2.4.48 Comment state
  1336. //------------------------------------------------------------------
  _[COMMENT_STATE] = function commentState(cp) {
      // Appends to the comment data. '-' may begin the closing sequence, EOF
      // emits the comment and is reconsumed in the data state, NULL is
      // stored as the replacement character.
      switch (cp) {
          case $.HYPHEN_MINUS:
              this.state = COMMENT_END_DASH_STATE;
              break;

          case $.NULL:
              this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
              break;

          case $.EOF:
              this._emitCurrentToken();
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentToken.data += toChar(cp);
      }
  };
  1349. //12.2.4.49 Comment end dash state
  1350. //------------------------------------------------------------------
  _[COMMENT_END_DASH_STATE] = function commentEndDashState(cp) {
      // Runs after a single '-' inside a comment. A second '-' moves to the
      // comment end state; EOF emits the comment; otherwise the pending '-'
      // plus the current code point (NULL replaced) become comment data.
      switch (cp) {
          case $.HYPHEN_MINUS:
              this.state = COMMENT_END_STATE;
              break;

          case $.NULL:
              this.currentToken.data += '-';
              this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
              this.state = COMMENT_STATE;
              break;

          case $.EOF:
              this._emitCurrentToken();
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentToken.data += '-';
              this.currentToken.data += toChar(cp);
              this.state = COMMENT_STATE;
      }
  };
  1369. //12.2.4.50 Comment end state
  1370. //------------------------------------------------------------------
  _[COMMENT_END_STATE] = function commentEndState(cp) {
      // Runs after '--' inside a comment. '>' closes and emits the comment,
      // '!' moves to the comment end bang state, an extra '-' is kept as
      // data, EOF emits the comment, and anything else puts the pending '--'
      // plus the current code point (NULL replaced) back into the data.
      switch (cp) {
          case $.GREATER_THAN_SIGN:
              this.state = DATA_STATE;
              this._emitCurrentToken();
              break;

          case $.EXCLAMATION_MARK:
              this.state = COMMENT_END_BANG_STATE;
              break;

          case $.HYPHEN_MINUS:
              this.currentToken.data += '-';
              break;

          case $.NULL:
              this.currentToken.data += '--';
              this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
              this.state = COMMENT_STATE;
              break;

          case $.EOF:
              this._reconsumeInState(DATA_STATE);
              this._emitCurrentToken();
              break;

          default:
              this.currentToken.data += '--';
              this.currentToken.data += toChar(cp);
              this.state = COMMENT_STATE;
      }
  };
  1395. //12.2.4.51 Comment end bang state
  1396. //------------------------------------------------------------------
  _[COMMENT_END_BANG_STATE] = function commentEndBangState(cp) {
      // '>' closes the comment: switch to the data state and emit the token.
      if (cp === $.GREATER_THAN_SIGN) {
          this.state = DATA_STATE;
          this._emitCurrentToken();
          return;
      }

      // EOF: emit the token, then reconsume the EOF in the data state.
      if (cp === $.EOF) {
          this._emitCurrentToken();
          this._reconsumeInState(DATA_STATE);
          return;
      }

      // Every remaining branch first appends '--!' to the token data.
      this.currentToken.data += '--!';

      if (cp === $.HYPHEN_MINUS) {
          this.state = COMMENT_END_DASH_STATE;
      }
      else {
          // NULL is stored as U+FFFD; any other code point is stored as-is.
          this.currentToken.data += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
          this.state = COMMENT_STATE;
      }
  };
  1421. //12.2.4.52 DOCTYPE state
  1422. //------------------------------------------------------------------
  _[DOCTYPE_STATE] = function doctypeState(cp) {
      // Whitespace is ignored in this state.
      if (isWhitespace(cp)) {
          return;
      }

      switch (cp) {
          case $.GREATER_THAN_SIGN:
          case $.EOF:
              // No name was seen: create a doctype token with a null name, flag
              // it as force-quirks and emit it.
              this._createDoctypeToken(null);
              this.currentToken.forceQuirks = true;
              this._emitCurrentToken();

              // EOF is reconsumed in the data state; '>' just switches to it.
              if (cp === $.EOF) {
                  this._reconsumeInState(DATA_STATE);
              }
              else {
                  this.state = DATA_STATE;
              }
              break;

          default:
              // Start of a name: create a token with an empty name and let the
              // DOCTYPE name state process this same code point.
              this._createDoctypeToken('');
              this._reconsumeInState(DOCTYPE_NAME_STATE);
      }
  };
  1443. //12.2.4.54 DOCTYPE name state
  1444. //------------------------------------------------------------------
  _[DOCTYPE_NAME_STATE] = function doctypeNameState(cp) {
      // Whitespace, '>' and EOF end the name; the after DOCTYPE name state
      // reconsumes the same code point.
      var nameEnded = isWhitespace(cp) || cp === $.GREATER_THAN_SIGN || cp === $.EOF;

      if (nameEnded) {
          this._reconsumeInState(AFTER_DOCTYPE_NAME_STATE);
          return;
      }

      // Uppercase ASCII is lowercased, NULL becomes U+FFFD, everything else is
      // appended unchanged.
      var piece;

      if (isAsciiUpper(cp)) {
          piece = toAsciiLowerChar(cp);
      }
      else if (cp === $.NULL) {
          piece = UNICODE.REPLACEMENT_CHARACTER;
      }
      else {
          piece = toChar(cp);
      }

      this.currentToken.name += piece;
  };
  1455. //12.2.4.55 After DOCTYPE name state
  1456. //------------------------------------------------------------------
  _[AFTER_DOCTYPE_NAME_STATE] = function afterDoctypeNameState(cp) {
      // Whitespace is ignored in this state.
      if (isWhitespace(cp)) {
          return;
      }

      if (cp === $.GREATER_THAN_SIGN) {
          this.state = DATA_STATE;
          this._emitCurrentToken();
          return;
      }

      // Try the PUBLIC keyword first; SYSTEM is only attempted when PUBLIC did
      // not match. Both are matched with the last argument set to false.
      var sawPublic = this._consumeSubsequentIfMatch($$.PUBLIC_STRING, cp, false);
      var sawSystem = !sawPublic && this._consumeSubsequentIfMatch($$.SYSTEM_STRING, cp, false);

      // When _ensureHibernation() reports true, no state change is made here.
      if (this._ensureHibernation()) {
          return;
      }

      if (sawPublic) {
          this.state = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
      }
      else if (sawSystem) {
          this.state = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
      }
      else {
          // Neither keyword follows the name: flag force-quirks and go bogus.
          this.currentToken.forceQuirks = true;
          this.state = BOGUS_DOCTYPE_STATE;
      }
  };
  1479. //12.2.4.57 Before DOCTYPE public identifier state
  1480. //------------------------------------------------------------------
  _[BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE] = function beforeDoctypePublicIdentifierState(cp) {
      // Whitespace is ignored in this state.
      if (isWhitespace(cp)) {
          return;
      }

      var doubleQuoted = cp === $.QUOTATION_MARK;

      if (doubleQuoted || cp === $.APOSTROPHE) {
          // An opening quote starts an (initially empty) public identifier; the
          // quote kind selects which quoted state collects it.
          this.currentToken.publicId = '';
          this.state = doubleQuoted ?
              DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE :
              DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
      }
      else {
          // Anything else: flag force-quirks and reconsume in the bogus state.
          this.currentToken.forceQuirks = true;
          this._reconsumeInState(BOGUS_DOCTYPE_STATE);
      }
  };
  1497. //12.2.4.58 DOCTYPE public identifier (double-quoted) state
  1498. //------------------------------------------------------------------
  _[DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE] = function doctypePublicIdentifierDoubleQuotedState(cp) {
      // Collects the public identifier until the closing '"'. A '>' or EOF
      // before the closing quote flags force-quirks and emits the token.
      switch (cp) {
          case $.QUOTATION_MARK:
              this.state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
              break;

          case $.NULL:
              this.currentToken.publicId += UNICODE.REPLACEMENT_CHARACTER;
              break;

          case $.GREATER_THAN_SIGN:
              this.currentToken.forceQuirks = true;
              this._emitCurrentToken();
              this.state = DATA_STATE;
              break;

          case $.EOF:
              this.currentToken.forceQuirks = true;
              this._emitCurrentToken();
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentToken.publicId += toChar(cp);
      }
  };
  1517. //12.2.4.59 DOCTYPE public identifier (single-quoted) state
  1518. //------------------------------------------------------------------
  _[DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE] = function doctypePublicIdentifierSingleQuotedState(cp) {
      // The closing apostrophe ends the public identifier.
      if (cp === $.APOSTROPHE) {
          this.state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
          return;
      }

      // '>' or EOF before the closing quote: flag force-quirks and emit. EOF is
      // then reconsumed in the data state, while '>' only switches to it.
      if (cp === $.GREATER_THAN_SIGN || cp === $.EOF) {
          this.currentToken.forceQuirks = true;
          this._emitCurrentToken();

          if (cp === $.EOF) {
              this._reconsumeInState(DATA_STATE);
          }
          else {
              this.state = DATA_STATE;
          }

          return;
      }

      // Identifier content: NULL is stored as U+FFFD, everything else as-is.
      this.currentToken.publicId += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
  };
  1537. //12.2.4.61 Between DOCTYPE public and system identifiers state
  1538. //------------------------------------------------------------------
  _[BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE] = function betweenDoctypePublicAndSystemIdentifiersState(cp) {
      // Whitespace is ignored in this state.
      if (isWhitespace(cp)) {
          return;
      }

      switch (cp) {
          case $.GREATER_THAN_SIGN:
              // No system identifier follows: emit the token as it stands.
              this._emitCurrentToken();
              this.state = DATA_STATE;
              break;

          case $.QUOTATION_MARK:
              this.currentToken.systemId = '';
              this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
              break;

          case $.APOSTROPHE:
              this.currentToken.systemId = '';
              this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
              break;

          default:
              // Anything else: flag force-quirks and reconsume in the bogus state.
              this.currentToken.forceQuirks = true;
              this._reconsumeInState(BOGUS_DOCTYPE_STATE);
      }
  };
  1559. //12.2.4.63 Before DOCTYPE system identifier state
  1560. //------------------------------------------------------------------
  _[BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE] = function beforeDoctypeSystemIdentifierState(cp) {
      // Whitespace is ignored in this state.
      if (isWhitespace(cp)) {
          return;
      }

      switch (cp) {
          // An opening quote starts an (initially empty) system identifier.
          case $.QUOTATION_MARK:
              this.currentToken.systemId = '';
              this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
              break;

          case $.APOSTROPHE:
              this.currentToken.systemId = '';
              this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
              break;

          // Anything else: flag force-quirks and reconsume in the bogus state.
          default:
              this.currentToken.forceQuirks = true;
              this._reconsumeInState(BOGUS_DOCTYPE_STATE);
      }
  };
  1577. //12.2.4.64 DOCTYPE system identifier (double-quoted) state
  1578. //------------------------------------------------------------------
  _[DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE] = function doctypeSystemIdentifierDoubleQuotedState(cp) {
      // The closing '"' ends the system identifier.
      if (cp === $.QUOTATION_MARK) {
          this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
          return;
      }

      // '>' or EOF before the closing quote: flag force-quirks and emit. EOF is
      // then reconsumed in the data state, while '>' only switches to it.
      var atEof = cp === $.EOF;

      if (atEof || cp === $.GREATER_THAN_SIGN) {
          this.currentToken.forceQuirks = true;
          this._emitCurrentToken();

          if (atEof) {
              this._reconsumeInState(DATA_STATE);
          }
          else {
              this.state = DATA_STATE;
          }

          return;
      }

      // Identifier content: NULL is stored as U+FFFD, everything else as-is.
      this.currentToken.systemId += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
  };
  1597. //12.2.4.65 DOCTYPE system identifier (single-quoted) state
  1598. //------------------------------------------------------------------
  _[DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE] = function doctypeSystemIdentifierSingleQuotedState(cp) {
      // Collects the system identifier until the closing apostrophe. A '>' or
      // EOF before the closing quote flags force-quirks and emits the token.
      switch (cp) {
          case $.APOSTROPHE:
              this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
              break;

          case $.GREATER_THAN_SIGN:
              this.currentToken.forceQuirks = true;
              this._emitCurrentToken();
              this.state = DATA_STATE;
              break;

          case $.NULL:
              this.currentToken.systemId += UNICODE.REPLACEMENT_CHARACTER;
              break;

          case $.EOF:
              this.currentToken.forceQuirks = true;
              this._emitCurrentToken();
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentToken.systemId += toChar(cp);
      }
  };
  1617. //12.2.4.66 After DOCTYPE system identifier state
  1618. //------------------------------------------------------------------
  _[AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE] = function afterDoctypeSystemIdentifierState(cp) {
      // Whitespace is ignored in this state.
      if (isWhitespace(cp)) {
          return;
      }

      switch (cp) {
          case $.GREATER_THAN_SIGN:
              this._emitCurrentToken();
              this.state = DATA_STATE;
              break;

          case $.EOF:
              // EOF flags force-quirks before the token is emitted.
              this.currentToken.forceQuirks = true;
              this._emitCurrentToken();
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              // Any other code point switches to the bogus state; force-quirks
              // is left untouched on this path.
              this.state = BOGUS_DOCTYPE_STATE;
      }
  };
  1634. //12.2.4.67 Bogus DOCTYPE state
  1635. //------------------------------------------------------------------
  _[BOGUS_DOCTYPE_STATE] = function bogusDoctypeState(cp) {
      var atEof = cp === $.EOF;

      // Every code point other than '>' and EOF is dropped without any effect.
      if (!atEof && cp !== $.GREATER_THAN_SIGN) {
          return;
      }

      this._emitCurrentToken();

      // EOF is reconsumed in the data state; '>' only switches to it.
      if (atEof) {
          this._reconsumeInState(DATA_STATE);
      }
      else {
          this.state = DATA_STATE;
      }
  };
  1646. //12.2.4.68 CDATA section state
  1647. //------------------------------------------------------------------
  _[CDATA_SECTION_STATE] = function cdataSectionState(cp) {
      // Processes code points in a loop instead of returning after each one.
      // The loop is left on EOF, when the CDATA end string matches, or whenever
      // _ensureHibernation() reports true.
      for (;;) {
          if (cp === $.EOF) {
              this._reconsumeInState(DATA_STATE);
              return;
          }

          // The end string is matched with the last argument set to true.
          var sectionClosed = this._consumeSubsequentIfMatch($$.CDATA_END_STRING, cp, true);

          if (this._ensureHibernation()) {
              return;
          }

          if (sectionClosed) {
              this.state = DATA_STATE;
              return;
          }

          // Not the end marker: emit the code point, take a hibernation snapshot
          // and fetch the next code point.
          this._emitCodePoint(cp);
          this._hibernationSnapshot();
          cp = this._consume();

          if (this._ensureHibernation()) {
              return;
          }
      }
  };