// Scintilla source code edit control /** @file LexHTML.cxx ** Lexer for HTML. **/ // Copyright 1998-2005 by Neil Hodgson // The License.txt file describes the conditions under which this software may be distributed. #include #include #include #include #include #include "Platform.h" #include "PropSet.h" #include "Accessor.h" #include "StyleContext.h" #include "KeyWords.h" #include "Scintilla.h" #include "SciLexer.h" #define SCE_HA_JS (SCE_HJA_START - SCE_HJ_START) #define SCE_HA_VBS (SCE_HBA_START - SCE_HB_START) #define SCE_HA_PYTHON (SCE_HPA_START - SCE_HP_START) enum script_type { eScriptNone = 0, eScriptJS, eScriptVBS, eScriptPython, eScriptPHP, eScriptXML, eScriptSGML, eScriptSGMLblock }; enum script_mode { eHtml = 0, eNonHtmlScript, eNonHtmlPreProc, eNonHtmlScriptPreProc }; static inline bool IsAWordChar(const int ch) { return (ch < 0x80) && (isalnum(ch) || ch == '.' || ch == '_'); } static inline bool IsAWordStart(const int ch) { return (ch < 0x80) && (isalnum(ch) || ch == '_'); } static inline int MakeLowerCase(int ch) { if (ch < 'A' || ch > 'Z') return ch; else return ch - 'A' + 'a'; } static void GetTextSegment(Accessor &styler, unsigned int start, unsigned int end, char *s, size_t len) { size_t i = 0; for (; (i < end - start + 1) && (i < len-1); i++) { s[i] = static_cast(MakeLowerCase(styler[start + i])); } s[i] = '\0'; } static script_type segIsScriptingIndicator(Accessor &styler, unsigned int start, unsigned int end, script_type prevValue) { char s[100]; GetTextSegment(styler, start, end, s, sizeof(s)); //Platform::DebugPrintf("Scripting indicator [%s]\n", s); if (strstr(s, "src")) // External script return eScriptNone; if (strstr(s, "vbs")) return eScriptVBS; if (strstr(s, "pyth")) return eScriptPython; if (strstr(s, "javas")) return eScriptJS; if (strstr(s, "jscr")) return eScriptJS; if (strstr(s, "php")) return eScriptPHP; if (strstr(s, "xml")) return eScriptXML; return prevValue; } static int PrintScriptingIndicatorOffset(Accessor &styler, unsigned int start, unsigned int end) { int iResult = 0; char s[100]; GetTextSegment(styler, start, end, s, sizeof(s)); if (0 == strncmp(s, "php", 3)) { iResult = 3; } return iResult; } static script_type ScriptOfState(int state) { if ((state >= SCE_HP_START) && (state <= SCE_HP_IDENTIFIER)) { return eScriptPython; } else if ((state >= SCE_HB_START) && (state <= SCE_HB_STRINGEOL)) { return eScriptVBS; } else if ((state >= SCE_HJ_START) && (state <= SCE_HJ_REGEX)) { return eScriptJS; } else if ((state >= SCE_HPHP_DEFAULT) && (state <= SCE_HPHP_COMMENTLINE)) { return eScriptPHP; } else if ((state >= SCE_H_SGML_DEFAULT) && (state < SCE_H_SGML_BLOCK_DEFAULT)) { return eScriptSGML; } else if (state == SCE_H_SGML_BLOCK_DEFAULT) { return eScriptSGMLblock; } else { return eScriptNone; } } static int statePrintForState(int state, script_mode inScriptType) { int StateToPrint; if ((state >= SCE_HP_START) && (state <= SCE_HP_IDENTIFIER)) { StateToPrint = state + ((inScriptType == eNonHtmlScript) ? 0 : SCE_HA_PYTHON); } else if ((state >= SCE_HB_START) && (state <= SCE_HB_STRINGEOL)) { StateToPrint = state + ((inScriptType == eNonHtmlScript) ? 0 : SCE_HA_VBS); } else if ((state >= SCE_HJ_START) && (state <= SCE_HJ_REGEX)) { StateToPrint = state + ((inScriptType == eNonHtmlScript) ? 0 : SCE_HA_JS); } else { StateToPrint = state; } return StateToPrint; } static int stateForPrintState(int StateToPrint) { int state; if ((StateToPrint >= SCE_HPA_START) && (StateToPrint <= SCE_HPA_IDENTIFIER)) { state = StateToPrint - SCE_HA_PYTHON; } else if ((StateToPrint >= SCE_HBA_START) && (StateToPrint <= SCE_HBA_STRINGEOL)) { state = StateToPrint - SCE_HA_VBS; } else if ((StateToPrint >= SCE_HJA_START) && (StateToPrint <= SCE_HJA_REGEX)) { state = StateToPrint - SCE_HA_JS; } else { state = StateToPrint; } return state; } static inline bool IsNumber(unsigned int start, Accessor &styler) { return IsADigit(styler[start]) || (styler[start] == '.') || (styler[start] == '-') || (styler[start] == '#'); } static inline bool isStringState(int state) { bool bResult; switch (state) { case SCE_HJ_DOUBLESTRING: case SCE_HJ_SINGLESTRING: case SCE_HJA_DOUBLESTRING: case SCE_HJA_SINGLESTRING: case SCE_HB_STRING: case SCE_HBA_STRING: case SCE_HP_STRING: case SCE_HP_CHARACTER: case SCE_HP_TRIPLE: case SCE_HP_TRIPLEDOUBLE: case SCE_HPA_STRING: case SCE_HPA_CHARACTER: case SCE_HPA_TRIPLE: case SCE_HPA_TRIPLEDOUBLE: case SCE_HPHP_HSTRING: case SCE_HPHP_SIMPLESTRING: case SCE_HPHP_HSTRING_VARIABLE: case SCE_HPHP_COMPLEX_VARIABLE: bResult = true; break; default : bResult = false; break; } return bResult; } static inline bool stateAllowsTermination(int state) { bool allowTermination = !isStringState(state); if (allowTermination) { switch (state) { case SCE_HPHP_COMMENT: case SCE_HP_COMMENTLINE: case SCE_HPA_COMMENTLINE: allowTermination = false; } } return allowTermination; } // not really well done, since it's only comments that should lex the %> and <% static inline bool isCommentASPState(int state) { bool bResult; switch (state) { case SCE_HJ_COMMENT: case SCE_HJ_COMMENTLINE: case SCE_HJ_COMMENTDOC: case SCE_HB_COMMENTLINE: case SCE_HP_COMMENTLINE: case SCE_HPHP_COMMENT: case SCE_HPHP_COMMENTLINE: bResult = true; break; default : bResult = false; break; } return bResult; } static void classifyAttribHTML(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler) { bool wordIsNumber = IsNumber(start, styler); char chAttr = SCE_H_ATTRIBUTEUNKNOWN; if (wordIsNumber) { chAttr = SCE_H_NUMBER; } else { char s[100]; GetTextSegment(styler, start, end, s, sizeof(s)); if (keywords.InList(s)) chAttr = SCE_H_ATTRIBUTE; } if ((chAttr == SCE_H_ATTRIBUTEUNKNOWN) && !keywords) // No keywords -> all are known chAttr = SCE_H_ATTRIBUTE; styler.ColourTo(end, chAttr); } static int classifyTagHTML(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler, bool &tagDontFold, bool caseSensitive) { char s[30 + 2]; // Copy after the '<' unsigned int i = 0; for (unsigned int cPos = start; cPos <= end && i < 30; cPos++) { char ch = styler[cPos]; if ((ch != '<') && (ch != '/')) { s[i++] = caseSensitive ? ch : static_cast(MakeLowerCase(ch)); } } //The following is only a quick hack, to see if this whole thing would work //we first need the tagname with a trailing space... s[i] = ' '; s[i+1] = '\0'; //...to find it in the list of no-container-tags // (There are many more. We will need a keywordlist in the property file for this) tagDontFold = (NULL != strstr("meta link img area br hr input ",s)); //now we can remove the trailing space s[i] = '\0'; bool isScript = false; char chAttr = SCE_H_TAGUNKNOWN; if (s[0] == '!') { chAttr = SCE_H_SGML_DEFAULT; } else if (s[0] == '/') { // Closing tag if (keywords.InList(s + 1)) chAttr = SCE_H_TAG; } else { if (keywords.InList(s)) { chAttr = SCE_H_TAG; isScript = 0 == strcmp(s, "script"); } } if ((chAttr == SCE_H_TAGUNKNOWN) && !keywords) { // No keywords -> all are known chAttr = SCE_H_TAG; isScript = 0 == strcmp(s, "script"); } styler.ColourTo(end, chAttr); return isScript ? SCE_H_SCRIPT : chAttr; } static void classifyWordHTJS(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler, script_mode inScriptType) { char chAttr = SCE_HJ_WORD; bool wordIsNumber = IsADigit(styler[start]) || (styler[start] == '.'); if (wordIsNumber) chAttr = SCE_HJ_NUMBER; else { char s[30 + 1]; unsigned int i = 0; for (; i < end - start + 1 && i < 30; i++) { s[i] = styler[start + i]; } s[i] = '\0'; if (keywords.InList(s)) chAttr = SCE_HJ_KEYWORD; } styler.ColourTo(end, statePrintForState(chAttr, inScriptType)); } static int classifyWordHTVB(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler, script_mode inScriptType) { char chAttr = SCE_HB_IDENTIFIER; bool wordIsNumber = IsADigit(styler[start]) || (styler[start] == '.'); if (wordIsNumber) chAttr = SCE_HB_NUMBER; else { char s[100]; GetTextSegment(styler, start, end, s, sizeof(s)); if (keywords.InList(s)) { chAttr = SCE_HB_WORD; if (strcmp(s, "rem") == 0) chAttr = SCE_HB_COMMENTLINE; } } styler.ColourTo(end, statePrintForState(chAttr, inScriptType)); if (chAttr == SCE_HB_COMMENTLINE) return SCE_HB_COMMENTLINE; else return SCE_HB_DEFAULT; } static void classifyWordHTPy(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler, char *prevWord, script_mode inScriptType) { bool wordIsNumber = IsADigit(styler[start]); char s[30 + 1]; unsigned int i = 0; for (; i < end - start + 1 && i < 30; i++) { s[i] = styler[start + i]; } s[i] = '\0'; char chAttr = SCE_HP_IDENTIFIER; if (0 == strcmp(prevWord, "class")) chAttr = SCE_HP_CLASSNAME; else if (0 == strcmp(prevWord, "def")) chAttr = SCE_HP_DEFNAME; else if (wordIsNumber) chAttr = SCE_HP_NUMBER; else if (keywords.InList(s)) chAttr = SCE_HP_WORD; styler.ColourTo(end, statePrintForState(chAttr, inScriptType)); strcpy(prevWord, s); } // Update the word colour to default or keyword // Called when in a PHP word static void classifyWordHTPHP(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler) { char chAttr = SCE_HPHP_DEFAULT; bool wordIsNumber = IsADigit(styler[start]) || (styler[start] == '.' && start+1 <= end && IsADigit(styler[start+1])); if (wordIsNumber) chAttr = SCE_HPHP_NUMBER; else { char s[100]; GetTextSegment(styler, start, end, s, sizeof(s)); if (keywords.InList(s)) chAttr = SCE_HPHP_WORD; } styler.ColourTo(end, chAttr); } static bool isWordHSGML(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler) { char s[30 + 1]; unsigned int i = 0; for (; i < end - start + 1 && i < 30; i++) { s[i] = styler[start + i]; } s[i] = '\0'; return keywords.InList(s); } static bool isWordCdata(unsigned int start, unsigned int end, Accessor &styler) { char s[30 + 1]; unsigned int i = 0; for (; i < end - start + 1 && i < 30; i++) { s[i] = styler[start + i]; } s[i] = '\0'; return (0 == strcmp(s, "[CDATA[")); } // Return the first state to reach when entering a scripting language static int StateForScript(script_type scriptLanguage) { int Result; switch (scriptLanguage) { case eScriptVBS: Result = SCE_HB_START; break; case eScriptPython: Result = SCE_HP_START; break; case eScriptPHP: Result = SCE_HPHP_DEFAULT; break; case eScriptXML: Result = SCE_H_TAGUNKNOWN; break; case eScriptSGML: Result = SCE_H_SGML_DEFAULT; break; default : Result = SCE_HJ_START; break; } return Result; } static inline bool ishtmlwordchar(char ch) { return !isascii(ch) || (isalnum(ch) || ch == '.' || ch == '-' || ch == '_' || ch == ':' || ch == '!' || ch == '#'); } static inline bool issgmlwordchar(char ch) { return !isascii(ch) || (isalnum(ch) || ch == '.' || ch == '_' || ch == ':' || ch == '!' || ch == '#' || ch == '['); } static inline bool IsPhpWordStart(const unsigned char ch) { return (isascii(ch) && (isalpha(ch) || (ch == '_'))) || (ch >= 0x7f); } static inline bool IsPhpWordChar(char ch) { return IsADigit(ch) || IsPhpWordStart(ch); } static bool InTagState(int state) { return state == SCE_H_TAG || state == SCE_H_TAGUNKNOWN || state == SCE_H_SCRIPT || state == SCE_H_ATTRIBUTE || state == SCE_H_ATTRIBUTEUNKNOWN || state == SCE_H_NUMBER || state == SCE_H_OTHER || state == SCE_H_DOUBLESTRING || state == SCE_H_SINGLESTRING; } static bool IsCommentState(const int state) { return state == SCE_H_COMMENT || state == SCE_H_SGML_COMMENT; } static bool IsScriptCommentState(const int state) { return state == SCE_HJ_COMMENT || state == SCE_HJ_COMMENTLINE || state == SCE_HJA_COMMENT || state == SCE_HJA_COMMENTLINE || state == SCE_HB_COMMENTLINE || state == SCE_HBA_COMMENTLINE; } static bool isLineEnd(char ch) { return ch == '\r' || ch == '\n'; } static bool isOKBeforeRE(char ch) { return (ch == '(') || (ch == '=') || (ch == ','); } static bool isPHPStringState(int state) { return (state == SCE_HPHP_HSTRING) || (state == SCE_HPHP_SIMPLESTRING) || (state == SCE_HPHP_HSTRING_VARIABLE) || (state == SCE_HPHP_COMPLEX_VARIABLE); } static int FindPhpStringDelimiter(char *phpStringDelimiter, const int phpStringDelimiterSize, int i, const int lengthDoc, Accessor &styler) { int j; while (i < lengthDoc && (styler[i] == ' ' || styler[i] == '\t')) i++; phpStringDelimiter[0] = '\n'; for (j = i; j < lengthDoc && styler[j] != '\n' && styler[j] != '\r'; j++) { if (j - i < phpStringDelimiterSize - 2) phpStringDelimiter[j-i+1] = styler[j]; else i++; } phpStringDelimiter[j-i+1] = '\0'; return j; } static void ColouriseHyperTextDoc(unsigned int startPos, int length, int initStyle, WordList *keywordlists[], Accessor &styler) { WordList &keywords = *keywordlists[0]; WordList &keywords2 = *keywordlists[1]; WordList &keywords3 = *keywordlists[2]; WordList &keywords4 = *keywordlists[3]; WordList &keywords5 = *keywordlists[4]; WordList &keywords6 = *keywordlists[5]; // SGML (DTD) keywords // Lexer for HTML requires more lexical states (7 bits worth) than most lexers styler.StartAt(startPos, STYLE_MAX); char prevWord[200]; prevWord[0] = '\0'; char phpStringDelimiter[200]; // PHP is not limited in length, we are phpStringDelimiter[0] = '\0'; int StateToPrint = initStyle; int state = stateForPrintState(StateToPrint); // If inside a tag, it may be a script tag, so reread from the start to ensure any language tags are seen if (InTagState(state)) { while ((startPos > 0) && (InTagState(styler.StyleAt(startPos - 1)))) { startPos--; length++; } state = SCE_H_DEFAULT; } // String can be heredoc, must find a delimiter first while (startPos > 0 && isPHPStringState(state) && state != SCE_HPHP_SIMPLESTRING) { startPos--; length++; state = styler.StyleAt(startPos); } styler.StartAt(startPos, STYLE_MAX); int lineCurrent = styler.GetLine(startPos); int lineState; if (lineCurrent > 0) { lineState = styler.GetLineState(lineCurrent); } else { // Default client and ASP scripting language is JavaScript lineState = eScriptJS << 8; lineState |= styler.GetPropertyInt("asp.default.language", eScriptJS) << 4; } script_mode inScriptType = script_mode((lineState >> 0) & 0x03); // 2 bits of scripting mode bool tagOpened = (lineState >> 2) & 0x01; // 1 bit to know if we are in an opened tag bool tagClosing = (lineState >> 3) & 0x01; // 1 bit to know if we are in a closing tag bool tagDontFold = false; //some HTML tags should not be folded script_type aspScript = script_type((lineState >> 4) & 0x0F); // 4 bits of script name script_type clientScript = script_type((lineState >> 8) & 0x0F); // 4 bits of script name int beforePreProc = (lineState >> 12) & 0xFF; // 8 bits of state script_type scriptLanguage = ScriptOfState(state); const bool foldHTML = styler.GetPropertyInt("fold.html", 0) != 0; const bool fold = foldHTML && styler.GetPropertyInt("fold", 0); const bool foldHTMLPreprocessor = foldHTML && styler.GetPropertyInt("fold.html.preprocessor", 1); const bool foldCompact = styler.GetPropertyInt("fold.compact", 1) != 0; const bool caseSensitive = styler.GetPropertyInt("html.tags.case.sensitive", 0) != 0; int levelPrev = styler.LevelAt(lineCurrent) & SC_FOLDLEVELNUMBERMASK; int levelCurrent = levelPrev; int visibleChars = 0; char chPrev = ' '; char ch = ' '; char chPrevNonWhite = ' '; // look back to set chPrevNonWhite properly for better regex colouring if (scriptLanguage == eScriptJS && startPos > 0) { int back = startPos; int style = 0; while (--back) { style = styler.StyleAt(back); if (style < SCE_HJ_DEFAULT || style > SCE_HJ_COMMENTDOC) // includes SCE_HJ_COMMENT & SCE_HJ_COMMENTLINE break; } if (style == SCE_HJ_SYMBOLS) { chPrevNonWhite = styler.SafeGetCharAt(back); } } styler.StartSegment(startPos); const int lengthDoc = startPos + length; for (int i = startPos; i < lengthDoc; i++) { const char chPrev2 = chPrev; chPrev = ch; if (!isspacechar(ch) && state != SCE_HJ_COMMENT && state != SCE_HJ_COMMENTLINE && state != SCE_HJ_COMMENTDOC) chPrevNonWhite = ch; ch = styler[i]; char chNext = styler.SafeGetCharAt(i + 1); const char chNext2 = styler.SafeGetCharAt(i + 2); // Handle DBCS codepages if (styler.IsLeadByte(ch)) { chPrev = ' '; i += 1; continue; } if ((!isspacechar(ch) || !foldCompact) && fold) visibleChars++; // decide what is the current state to print (depending of the script tag) StateToPrint = statePrintForState(state, inScriptType); // handle script folding if (fold) { switch (scriptLanguage) { case eScriptJS: case eScriptPHP: //not currently supported case eScriptVBS: if ((state != SCE_HPHP_COMMENT) && (state != SCE_HPHP_COMMENTLINE) && (state != SCE_HJ_COMMENT) && (state != SCE_HJ_COMMENTLINE) && (state != SCE_HJ_COMMENTDOC) && (!isStringState(state))) { //Platform::DebugPrintf("state=%d, StateToPrint=%d, initStyle=%d\n", state, StateToPrint, initStyle); //if ((state == SCE_HPHP_OPERATOR) || (state == SCE_HPHP_DEFAULT) || (state == SCE_HJ_SYMBOLS) || (state == SCE_HJ_START) || (state == SCE_HJ_DEFAULT)) { if ((ch == '{') || (ch == '}')) { levelCurrent += (ch == '{') ? 1 : -1; } } break; case eScriptPython: if (state != SCE_HP_COMMENTLINE) { if ((ch == ':') && ((chNext == '\n') || (chNext == '\r' && chNext2 == '\n'))) { levelCurrent++; } else if ((ch == '\n') && !((chNext == '\r') && (chNext2 == '\n')) && (chNext != '\n')) { // check if the number of tabs is lower than the level int Findlevel = (levelCurrent & ~SC_FOLDLEVELBASE) * 8; for (int j = 0; Findlevel > 0; j++) { char chTmp = styler.SafeGetCharAt(i + j + 1); if (chTmp == '\t') { Findlevel -= 8; } else if (chTmp == ' ') { Findlevel--; } else { break; } } if (Findlevel > 0) { levelCurrent -= Findlevel / 8; if (Findlevel % 8) levelCurrent--; } } } break; default: break; } } if ((ch == '\r' && chNext != '\n') || (ch == '\n')) { // Trigger on CR only (Mac style) or either on LF from CR+LF (Dos/Win) or on LF alone (Unix) // Avoid triggering two times on Dos/Win // New line -> record any line state onto /next/ line if (fold) { int lev = levelPrev; if (visibleChars == 0) lev |= SC_FOLDLEVELWHITEFLAG; if ((levelCurrent > levelPrev) && (visibleChars > 0)) lev |= SC_FOLDLEVELHEADERFLAG; styler.SetLevel(lineCurrent, lev); visibleChars = 0; levelPrev = levelCurrent; } lineCurrent++; styler.SetLineState(lineCurrent, ((inScriptType & 0x03) << 0) | ((tagOpened & 0x01) << 2) | ((tagClosing & 0x01) << 3) | ((aspScript & 0x0F) << 4) | ((clientScript & 0x0F) << 8) | ((beforePreProc & 0xFF) << 12)); } // generic end of script processing else if ((inScriptType == eNonHtmlScript) && (ch == '<') && (chNext == '/')) { // Check if it's the end of the script tag (or any other HTML tag) switch (state) { // in these cases, you can embed HTML tags (to confirm !!!!!!!!!!!!!!!!!!!!!!) case SCE_H_DOUBLESTRING: case SCE_H_SINGLESTRING: case SCE_HJ_COMMENT: case SCE_HJ_COMMENTDOC: //case SCE_HJ_COMMENTLINE: // removed as this is a common thing done to hide // the end of script marker from some JS interpreters. case SCE_HJ_DOUBLESTRING: case SCE_HJ_SINGLESTRING: case SCE_HJ_REGEX: case SCE_HB_STRING: case SCE_HP_STRING: case SCE_HP_TRIPLE: case SCE_HP_TRIPLEDOUBLE: break; default : // check if the closing tag is a script tag if (state == SCE_HJ_COMMENTLINE) { char tag[7]; // room for the