// Scintilla source code edit control /** @file LexBash.cxx ** Lexer for Bash. **/ // Copyright 2004-2005 by Neil Hodgson // Adapted from LexPerl by Kein-Hong Man 2004 // The License.txt file describes the conditions under which this software may be distributed. #include #include #include #include #include #include "Platform.h" #include "PropSet.h" #include "Accessor.h" #include "KeyWords.h" #include "Scintilla.h" #include "SciLexer.h" #define BASH_BASE_ERROR 65 #define BASH_BASE_DECIMAL 66 #define BASH_BASE_HEX 67 #define BASH_BASE_OCTAL 68 #define BASH_BASE_OCTAL_ERROR 69 #define HERE_DELIM_MAX 256 static inline int translateBashDigit(char ch) { if (ch >= '0' && ch <= '9') { return ch - '0'; } else if (ch >= 'a' && ch <= 'z') { return ch - 'a' + 10; } else if (ch >= 'A' && ch <= 'Z') { return ch - 'A' + 36; } else if (ch == '@') { return 62; } else if (ch == '_') { return 63; } return BASH_BASE_ERROR; } static inline bool isEOLChar(char ch) { return (ch == '\r') || (ch == '\n'); } static bool isSingleCharOp(char ch) { char strCharSet[2]; strCharSet[0] = ch; strCharSet[1] = '\0'; return (NULL != strstr("rwxoRWXOezsfdlpSbctugkTBMACahGLNn", strCharSet)); } static inline bool isBashOperator(char ch) { if (ch == '^' || ch == '&' || ch == '\\' || ch == '%' || ch == '(' || ch == ')' || ch == '-' || ch == '+' || ch == '=' || ch == '|' || ch == '{' || ch == '}' || ch == '[' || ch == ']' || ch == ':' || ch == ';' || ch == '>' || ch == ',' || ch == '/' || ch == '<' || ch == '?' || ch == '!' || ch == '.' || ch == '~' || ch == '@') return true; return false; } static int classifyWordBash(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler) { char s[100]; for (unsigned int i = 0; i < end - start + 1 && i < 30; i++) { s[i] = styler[start + i]; s[i + 1] = '\0'; } char chAttr = SCE_SH_IDENTIFIER; if (keywords.InList(s)) chAttr = SCE_SH_WORD; styler.ColourTo(end, chAttr); return chAttr; } static inline int getBashNumberBase(unsigned int start, unsigned int end, Accessor &styler) { int base = 0; for (unsigned int i = 0; i < end - start + 1 && i < 10; i++) { base = base * 10 + (styler[start + i] - '0'); } if (base > 64 || (end - start) > 1) { return BASH_BASE_ERROR; } return base; } static inline bool isEndVar(char ch) { return !isalnum(ch) && ch != '$' && ch != '_'; } static inline bool isNonQuote(char ch) { return isalnum(ch) || ch == '_'; } static bool isMatch(Accessor &styler, int lengthDoc, int pos, const char *val) { if ((pos + static_cast(strlen(val))) >= lengthDoc) { return false; } while (*val) { if (*val != styler[pos++]) { return false; } val++; } return true; } static char opposite(char ch) { if (ch == '(') return ')'; if (ch == '[') return ']'; if (ch == '{') return '}'; if (ch == '<') return '>'; return ch; } static void ColouriseBashDoc(unsigned int startPos, int length, int initStyle, WordList *keywordlists[], Accessor &styler) { // Lexer for bash often has to backtrack to start of current style to determine // which characters are being used as quotes, how deeply nested is the // start position and what the termination string is for here documents WordList &keywords = *keywordlists[0]; class HereDocCls { public: int State; // 0: '<<' encountered // 1: collect the delimiter // 2: here doc text (lines after the delimiter) char Quote; // the char after '<<' bool Quoted; // true if Quote in ('\'','"','`') bool Indent; // indented delimiter (for <<-) int DelimiterLength; // strlen(Delimiter) char *Delimiter; // the Delimiter, 256: sizeof PL_tokenbuf HereDocCls() { State = 0; Quote = 0; Quoted = false; Indent = 0; DelimiterLength = 0; Delimiter = new char[HERE_DELIM_MAX]; Delimiter[0] = '\0'; } ~HereDocCls() { delete []Delimiter; } }; HereDocCls HereDoc; class QuoteCls { public: int Rep; int Count; char Up; char Down; QuoteCls() { this->New(1); } void New(int r) { Rep = r; Count = 0; Up = '\0'; Down = '\0'; } void Open(char u) { Count++; Up = u; Down = opposite(Up); } }; QuoteCls Quote; int state = initStyle; int numBase = 0; unsigned int lengthDoc = startPos + length; // If in a long distance lexical state, seek to the beginning to find quote characters // Bash strings can be multi-line with embedded newlines, so backtrack. // Bash numbers have additional state during lexing, so backtrack too. if (state == SCE_SH_HERE_Q) { while ((startPos > 1) && (styler.StyleAt(startPos) != SCE_SH_HERE_DELIM)) { startPos--; } startPos = styler.LineStart(styler.GetLine(startPos)); state = styler.StyleAt(startPos - 1); } if (state == SCE_SH_STRING || state == SCE_SH_BACKTICKS || state == SCE_SH_CHARACTER || state == SCE_SH_NUMBER || state == SCE_SH_IDENTIFIER || state == SCE_SH_COMMENTLINE ) { while ((startPos > 1) && (styler.StyleAt(startPos - 1) == state)) { startPos--; } state = SCE_SH_DEFAULT; } styler.StartAt(startPos); char chPrev = styler.SafeGetCharAt(startPos - 1); if (startPos == 0) chPrev = '\n'; char chNext = styler[startPos]; styler.StartSegment(startPos); for (unsigned int i = startPos; i < lengthDoc; i++) { char ch = chNext; // if the current character is not consumed due to the completion of an // earlier style, lexing can be restarted via a simple goto restartLexer: chNext = styler.SafeGetCharAt(i + 1); char chNext2 = styler.SafeGetCharAt(i + 2); if (styler.IsLeadByte(ch)) { chNext = styler.SafeGetCharAt(i + 2); chPrev = ' '; i += 1; continue; } if ((chPrev == '\r' && ch == '\n')) { // skip on DOS/Windows styler.ColourTo(i, state); chPrev = ch; continue; } if (HereDoc.State == 1 && isEOLChar(ch)) { // Begin of here-doc (the line after the here-doc delimiter): // Lexically, the here-doc starts from the next line after the >>, but the // first line of here-doc seem to follow the style of the last EOL sequence HereDoc.State = 2; if (HereDoc.Quoted) { if (state == SCE_SH_HERE_DELIM) { // Missing quote at end of string! We are stricter than bash. // Colour here-doc anyway while marking this bit as an error. state = SCE_SH_ERROR; } styler.ColourTo(i - 1, state); // HereDoc.Quote always == '\'' state = SCE_SH_HERE_Q; } else { styler.ColourTo(i - 1, state); // always switch state = SCE_SH_HERE_Q; } } if (state == SCE_SH_DEFAULT) { if (ch == '\\') { // escaped character if (i < lengthDoc - 1) i++; ch = chNext; chNext = chNext2; styler.ColourTo(i, SCE_SH_IDENTIFIER); } else if (isdigit(ch)) { state = SCE_SH_NUMBER; numBase = BASH_BASE_DECIMAL; if (ch == '0') { // hex,octal if (chNext == 'x' || chNext == 'X') { numBase = BASH_BASE_HEX; i++; ch = chNext; chNext = chNext2; } else if (isdigit(chNext)) { numBase = BASH_BASE_OCTAL; } } } else if (iswordstart(ch)) { state = SCE_SH_WORD; if (!iswordchar(chNext) && chNext != '+' && chNext != '-') { // We need that if length of word == 1! // This test is copied from the SCE_SH_WORD handler. classifyWordBash(styler.GetStartSegment(), i, keywords, styler); state = SCE_SH_DEFAULT; } } else if (ch == '#') { state = SCE_SH_COMMENTLINE; } else if (ch == '\"') { state = SCE_SH_STRING; Quote.New(1); Quote.Open(ch); } else if (ch == '\'') { state = SCE_SH_CHARACTER; Quote.New(1); Quote.Open(ch); } else if (ch == '`') { state = SCE_SH_BACKTICKS; Quote.New(1); Quote.Open(ch); } else if (ch == '$') { if (chNext == '{') { state = SCE_SH_PARAM; goto startQuote; } else if (chNext == '\'') { state = SCE_SH_CHARACTER; goto startQuote; } else if (chNext == '"') { state = SCE_SH_STRING; goto startQuote; } else if (chNext == '(' && chNext2 == '(') { styler.ColourTo(i, SCE_SH_OPERATOR); state = SCE_SH_DEFAULT; goto skipChar; } else if (chNext == '(' || chNext == '`') { state = SCE_SH_BACKTICKS; startQuote: Quote.New(1); Quote.Open(chNext); goto skipChar; } else { state = SCE_SH_SCALAR; skipChar: i++; ch = chNext; chNext = chNext2; } } else if (ch == '*') { if (chNext == '*') { // exponentiation i++; ch = chNext; chNext = chNext2; } styler.ColourTo(i, SCE_SH_OPERATOR); } else if (ch == '<' && chNext == '<') { state = SCE_SH_HERE_DELIM; HereDoc.State = 0; HereDoc.Indent = false; } else if (ch == '-' // file test operators && isSingleCharOp(chNext) && !isalnum((chNext2 = styler.SafeGetCharAt(i+2)))) { styler.ColourTo(i + 1, SCE_SH_WORD); state = SCE_SH_DEFAULT; i++; ch = chNext; chNext = chNext2; } else if (isBashOperator(ch)) { styler.ColourTo(i, SCE_SH_OPERATOR); } else { // keep colouring defaults to make restart easier styler.ColourTo(i, SCE_SH_DEFAULT); } } else if (state == SCE_SH_NUMBER) { int digit = translateBashDigit(ch); if (numBase == BASH_BASE_DECIMAL) { if (ch == '#') { numBase = getBashNumberBase(styler.GetStartSegment(), i - 1, styler); if (numBase == BASH_BASE_ERROR) // take the rest as comment goto numAtEnd; } else if (!isdigit(ch)) goto numAtEnd; } else if (numBase == BASH_BASE_HEX) { if ((digit < 16) || (digit >= 36 && digit <= 41)) { // hex digit 0-9a-fA-F } else goto numAtEnd; } else if (numBase == BASH_BASE_OCTAL || numBase == BASH_BASE_OCTAL_ERROR) { if (digit > 7) { if (digit <= 9) { numBase = BASH_BASE_OCTAL_ERROR; } else goto numAtEnd; } } else if (numBase == BASH_BASE_ERROR) { if (digit > 9) goto numAtEnd; } else { // DD#DDDD number style handling if (digit != BASH_BASE_ERROR) { if (numBase <= 36) { // case-insensitive if base<=36 if (digit >= 36) digit -= 26; } if (digit >= numBase) { if (digit <= 9) { numBase = BASH_BASE_ERROR; } else goto numAtEnd; } } else { numAtEnd: if (numBase == BASH_BASE_ERROR || numBase == BASH_BASE_OCTAL_ERROR) state = SCE_SH_ERROR; styler.ColourTo(i - 1, state); state = SCE_SH_DEFAULT; goto restartLexer; } } } else if (state == SCE_SH_WORD) { if (!iswordchar(chNext) && chNext != '+' && chNext != '-') { // "." never used in Bash variable names // but used in file names classifyWordBash(styler.GetStartSegment(), i, keywords, styler); state = SCE_SH_DEFAULT; ch = ' '; } } else if (state == SCE_SH_IDENTIFIER) { if (!iswordchar(chNext) && chNext != '+' && chNext != '-') { styler.ColourTo(i, SCE_SH_IDENTIFIER); state = SCE_SH_DEFAULT; ch = ' '; } } else { if (state == SCE_SH_COMMENTLINE) { if (ch == '\\' && isEOLChar(chNext)) { // comment continuation if (chNext == '\r' && chNext2 == '\n') { i += 2; ch = styler.SafeGetCharAt(i); chNext = styler.SafeGetCharAt(i + 1); } else { i++; ch = chNext; chNext = chNext2; } } else if (isEOLChar(ch)) { styler.ColourTo(i - 1, state); state = SCE_SH_DEFAULT; goto restartLexer; } else if (isEOLChar(chNext)) { styler.ColourTo(i, state); state = SCE_SH_DEFAULT; } } else if (state == SCE_SH_HERE_DELIM) { // // From Bash info: // --------------- // Specifier format is: <<[-]WORD // Optional '-' is for removal of leading tabs from here-doc. // Whitespace acceptable after <<[-] operator // if (HereDoc.State == 0) { // '<<' encountered HereDoc.State = 1; HereDoc.Quote = chNext; HereDoc.Quoted = false; HereDoc.DelimiterLength = 0; HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; if (chNext == '\'' || chNext == '\"') { // a quoted here-doc delimiter (' or ") i++; ch = chNext; chNext = chNext2; HereDoc.Quoted = true; } else if (!HereDoc.Indent && chNext == '-') { // <<- indent case HereDoc.Indent = true; HereDoc.State = 0; } else if (isalpha(chNext) || chNext == '_' || chNext == '\\' || chNext == '-' || chNext == '+' || chNext == '!') { // an unquoted here-doc delimiter, no special handling // TODO check what exactly bash considers part of the delim } else if (chNext == '<') { // HERE string <<< i++; ch = chNext; chNext = chNext2; styler.ColourTo(i, SCE_SH_HERE_DELIM); state = SCE_SH_DEFAULT; HereDoc.State = 0; } else if (isspacechar(chNext)) { // eat whitespace HereDoc.State = 0; } else if (isdigit(chNext) || chNext == '=' || chNext == '$') { // left shift << or <<= operator cases styler.ColourTo(i, SCE_SH_OPERATOR); state = SCE_SH_DEFAULT; HereDoc.State = 0; } else { // symbols terminates; deprecated zero-length delimiter } } else if (HereDoc.State == 1) { // collect the delimiter if (HereDoc.Quoted) { // a quoted here-doc delimiter if (ch == HereDoc.Quote) { // closing quote => end of delimiter styler.ColourTo(i, state); state = SCE_SH_DEFAULT; } else { if (ch == '\\' && chNext == HereDoc.Quote) { // escaped quote i++; ch = chNext; chNext = chNext2; } HereDoc.Delimiter[HereDoc.DelimiterLength++] = ch; HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; } } else { // an unquoted here-doc delimiter if (isalnum(ch) || ch == '_' || ch == '-' || ch == '+' || ch == '!') { HereDoc.Delimiter[HereDoc.DelimiterLength++] = ch; HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; } else if (ch == '\\') { // skip escape prefix } else { styler.ColourTo(i - 1, state); state = SCE_SH_DEFAULT; goto restartLexer; } } if (HereDoc.DelimiterLength >= HERE_DELIM_MAX - 1) { styler.ColourTo(i - 1, state); state = SCE_SH_ERROR; goto restartLexer; } } } else if (HereDoc.State == 2) { // state == SCE_SH_HERE_Q if (isMatch(styler, lengthDoc, i, HereDoc.Delimiter)) { if (!HereDoc.Indent && isEOLChar(chPrev)) { endHereDoc: // standard HERE delimiter i += HereDoc.DelimiterLength; chPrev = styler.SafeGetCharAt(i - 1); ch = styler.SafeGetCharAt(i); if (isEOLChar(ch)) { styler.ColourTo(i - 1, state); state = SCE_SH_DEFAULT; HereDoc.State = 0; goto restartLexer; } chNext = styler.SafeGetCharAt(i + 1); } else if (HereDoc.Indent) { // indented HERE delimiter unsigned int bk = (i > 0)? i - 1: 0; while (i > 0) { ch = styler.SafeGetCharAt(bk--); if (isEOLChar(ch)) { goto endHereDoc; } else if (!isspacechar(ch)) { break; // got leading non-whitespace } } } } } else if (state == SCE_SH_SCALAR) { // variable names if (isEndVar(ch)) { if ((state == SCE_SH_SCALAR) && i == (styler.GetStartSegment() + 1)) { // Special variable: $(, $_ etc. styler.ColourTo(i, state); state = SCE_SH_DEFAULT; } else { styler.ColourTo(i - 1, state); state = SCE_SH_DEFAULT; goto restartLexer; } } } else if (state == SCE_SH_STRING || state == SCE_SH_CHARACTER || state == SCE_SH_BACKTICKS || state == SCE_SH_PARAM ) { if (!Quote.Down && !isspacechar(ch)) { Quote.Open(ch); } else if (ch == '\\' && Quote.Up != '\\') { i++; ch = chNext; chNext = styler.SafeGetCharAt(i + 1); } else if (ch == Quote.Down) { Quote.Count--; if (Quote.Count == 0) { Quote.Rep--; if (Quote.Rep <= 0) { styler.ColourTo(i, state); state = SCE_SH_DEFAULT; ch = ' '; } if (Quote.Up == Quote.Down) { Quote.Count++; } } } else if (ch == Quote.Up) { Quote.Count++; } } } if (state == SCE_SH_ERROR) { break; } chPrev = ch; } styler.ColourTo(lengthDoc - 1, state); } static bool IsCommentLine(int line, Accessor &styler) { int pos = styler.LineStart(line); int eol_pos = styler.LineStart(line + 1) - 1; for (int i = pos; i < eol_pos; i++) { char ch = styler[i]; if (ch == '#') return true; else if (ch != ' ' && ch != '\t') return false; } return false; } static void FoldBashDoc(unsigned int startPos, int length, int, WordList *[], Accessor &styler) { bool foldComment = styler.GetPropertyInt("fold.comment") != 0; bool foldCompact = styler.GetPropertyInt("fold.compact", 1) != 0; unsigned int endPos = startPos + length; int visibleChars = 0; int lineCurrent = styler.GetLine(startPos); int levelPrev = styler.LevelAt(lineCurrent) & SC_FOLDLEVELNUMBERMASK; int levelCurrent = levelPrev; char chNext = styler[startPos]; int styleNext = styler.StyleAt(startPos); for (unsigned int i = startPos; i < endPos; i++) { char ch = chNext; chNext = styler.SafeGetCharAt(i + 1); int style = styleNext; styleNext = styler.StyleAt(i + 1); bool atEOL = (ch == '\r' && chNext != '\n') || (ch == '\n'); // Comment folding if (foldComment && atEOL && IsCommentLine(lineCurrent, styler)) { if (!IsCommentLine(lineCurrent - 1, styler) && IsCommentLine(lineCurrent + 1, styler)) levelCurrent++; else if (IsCommentLine(lineCurrent - 1, styler) && !IsCommentLine(lineCurrent+1, styler)) levelCurrent--; } if (style == SCE_SH_OPERATOR) { if (ch == '{') { levelCurrent++; } else if (ch == '}') { levelCurrent--; } } if (atEOL) { int lev = levelPrev; if (visibleChars == 0 && foldCompact) lev |= SC_FOLDLEVELWHITEFLAG; if ((levelCurrent > levelPrev) && (visibleChars > 0)) lev |= SC_FOLDLEVELHEADERFLAG; if (lev != styler.LevelAt(lineCurrent)) { styler.SetLevel(lineCurrent, lev); } lineCurrent++; levelPrev = levelCurrent; visibleChars = 0; } if (!isspacechar(ch)) visibleChars++; } // Fill in the real level of the next line, keeping the current flags as they will be filled in later int flagsNext = styler.LevelAt(lineCurrent) & ~SC_FOLDLEVELNUMBERMASK; styler.SetLevel(lineCurrent, levelPrev | flagsNext); } static const char * const bashWordListDesc[] = { "Keywords", 0 }; LexerModule lmBash(SCLEX_BASH, ColouriseBashDoc, "bash", FoldBashDoc, bashWordListDesc);