You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2947 lines
70 KiB

/**
* @file tokenize.cpp
* This file breaks up the text stream into tokens or chunks.
*
* Each routine needs to set pc.len and pc.type.
*
* @author Ben Gardner
* @license GPL v2+
*/
#include "tokenize.h"
#include "keywords.h"
#include "prototypes.h"
#include "punctuators.h"
#include "unc_ctype.h"
#include <regex>
#include <stack>
#define LE_COUNT(x) cpd.le_counts[static_cast<size_t>(LE_ ## x)]
constexpr static auto LCURRENT = LTOK;
using namespace std;
using namespace uncrustify;
//! Snapshot of a read position in the input: index plus row/column bookkeeping.
struct TokenInfo
{
   size_t last_ch = 0; //! last character that was consumed
   size_t idx     = 0; //! index of the next character to read
   size_t row     = 1; //! current row, starts at 1
   size_t col     = 1; //! current column, starts at 1

   TokenInfo() = default;
};
/**
 * Read cursor over the input character deque.
 *
 * Keeps the current position (c) and one saved position (s) so that a
 * parse attempt can be rolled back. Reading past the end yields 0.
 */
struct TokenContext
{
   TokenContext(const deque<int> &d)
      : data(d)
   {
   }


   //! copy the current position into the caller-provided snapshot
   void save(TokenInfo &info)
   {
      info = c;
   }


   //! save before trying to parse something that may fail
   void save()
   {
      s = c;
   }


   //! make the given snapshot the current position
   void restore(const TokenInfo &info)
   {
      c = info;
   }


   //! restore previous saved state
   void restore()
   {
      c = s;
   }


   //! whether at least one more character is available
   bool more()
   {
      return(data.size() > c.idx);
   }


   //! look at the next character without consuming it (0 at end of input)
   size_t peek()
   {
      if (!more())
      {
         return(0);
      }
      return(data[c.idx]);
   }


   //! look at the character 'idx' positions ahead without consuming (0 if out of range)
   size_t peek(size_t idx)
   {
      const size_t pos = c.idx + idx;

      if (pos >= data.size())
      {
         return(0);
      }
      return(data[pos]);
   }


   //! consume one character and update row/column; returns 0 at end of input
   size_t get()
   {
      if (!more())
      {
         return(0);
      }
      const size_t ch = data[c.idx];

      ++c.idx;

      if (ch == '\t')
      {
         log_rule_B("input_tab_size");
         c.col = calc_next_tab_column(c.col, options::input_tab_size());
      }
      else if (  (ch == '\r')
              || (  (ch == '\n')
                 && (c.last_ch != '\r')))
      {
         // a '\n' directly after '\r' belongs to the same line ending
         c.row++;
         c.col = 1;
      }
      else if (ch != '\n')
      {
         c.col++;
      }
      c.last_ch = ch;
      return(ch);
   }


   //! consume the next character only if it equals ch
   bool expect(size_t ch)
   {
      if (peek() != ch)
      {
         return(false);
      }
      get();
      return(true);
   }

   const deque<int> &data;
   TokenInfo        c; //! current
   TokenInfo        s; //! saved
};
/**
* Count the number of characters in a quoted string.
* The next bit of text starts with a quote char " or ' or <.
* Count the number of characters until the matching character.
*
* @param pc The structure to update, str is an input.
*
* @return Whether a string was parsed
*/
static bool parse_string(TokenContext &ctx, Chunk &pc, size_t quote_idx, bool allow_escape);
/**
* Literal string, ends with single "
* Two "" don't end the string.
*
* @param pc The structure to update, str is an input.
*
* @return Whether a string was parsed
*/
static bool parse_cs_string(TokenContext &ctx, Chunk &pc);
/**
* VALA verbatim string, ends with three quotes (""")
*
* @param pc The structure to update, str is an input.
*/
static void parse_verbatim_string(TokenContext &ctx, Chunk &pc);
static bool tag_compare(const deque<int> &d, size_t a_idx, size_t b_idx, size_t len);
/**
* Parses a C++0x 'R' string. R"( xxx )" R"tag( )tag" u8R"(x)" uR"(x)"
* Newlines may be in the string.
*
* @param pc structure to update, str is an input.
*/
static bool parse_cr_string(TokenContext &ctx, Chunk &pc, size_t q_idx);
/**
* Count the number of whitespace characters.
*
* @param pc The structure to update, str is an input.
*
* @return Whether whitespace was parsed
*/
static bool parse_whitespace(TokenContext &ctx, Chunk &pc);
/**
* Called when we hit a backslash.
* If there is nothing but whitespace until the newline, then this is a
* backslash newline
*
* @param pc structure to update, str is an input
*/
static bool parse_bs_newline(TokenContext &ctx, Chunk &pc);
/**
* Parses any number of tab or space chars followed by a newline.
* Does not change pc.len if a newline isn't found.
* This is not the same as parse_whitespace() because it only consumes until
* a single newline is encountered.
*/
static bool parse_newline(TokenContext &ctx);
/**
* PAWN #define is different than C/C++.
* #define PATTERN REPLACEMENT_TEXT
* The PATTERN may not contain a space or '[' or ']'.
* A generic whitespace check should be good enough.
* Do not change the pattern.
*
* @param pc structure to update, str is an input
*/
static void parse_pawn_pattern(TokenContext &ctx, Chunk &pc, E_Token tt);
static bool parse_ignored(TokenContext &ctx, Chunk &pc);
/**
* Skips the next bit of whatever and returns the type of block.
*
* pc.str is the input text.
* pc.len is the output length.
* pc.type is the output type
* pc.column is output column
*
* @param pc The structure to update, str is an input.
* @param prev_pc The previous structure
*
* @return true/false - whether anything was parsed
*/
static bool parse_next(TokenContext &ctx, Chunk &pc, const Chunk *prev_pc);
/**
* Parses all legal D string constants.
*
* Quoted strings:
* r"Wysiwyg" # WYSIWYG string
* x"hexstring" # Hexadecimal array
* `Wysiwyg` # WYSIWYG string
* 'char' # single character
* "reg_string" # regular string
*
* Non-quoted strings:
* \x12 # 1-byte hex constant
* \u1234 # 2-byte hex constant
* \U12345678 # 4-byte hex constant
* \123 # octal constant
* \&amp; # named entity
* \n # single character
*
* @param pc The structure to update, str is an input.
*
* @return Whether a string was parsed
*/
static bool d_parse_string(TokenContext &ctx, Chunk &pc);
/**
* Figure out the length of the comment at text.
* The next bit of text starts with a '/', so it might be a comment.
* There are three types of comments:
* - C comments that start with '/ *' and end with '* /'
* - C++ comments that start with //
* - D nestable comments '/+' '+/'
*
* @param pc The structure to update, str is an input.
*
* @return Whether a comment was parsed
*/
static bool parse_comment(TokenContext &ctx, Chunk &pc);
/**
* Figure out the length of the code placeholder at text, if present.
* This is only for Xcode which sometimes inserts temporary code placeholder chunks, which in plaintext <#look like this#>.
*
* @param pc The structure to update, str is an input.
*
* @return Whether a placeholder was parsed.
*/
static bool parse_code_placeholder(TokenContext &ctx, Chunk &pc);
/**
* Parse any attached suffix, which may be a user-defined literal suffix.
* If for a string, explicitly exclude common format and scan specifiers, ie,
* PRIx32 and SCNx64.
*/
static void parse_suffix(TokenContext &ctx, Chunk &pc, bool forstring);
//! check if a symbol holds a binary value
static bool is_bin(int ch);
static bool is_bin_(int ch);
//! check if a symbol holds an octal value
static bool is_oct(int ch);
static bool is_oct_(int ch);
//! check if a symbol holds a decimal value
static bool is_dec(int ch);
static bool is_dec_(int ch);
//! check if a symbol holds a hexadecimal value
static bool is_hex(int ch);
static bool is_hex_(int ch);
/**
* Count the number of characters in the number.
* The next bit of text starts with a number (0-9 or '.'), so it is a number.
* Count the number of characters in the number.
*
* This should cover all number formats for all languages.
* Note that this is not a strict parser. It will happily parse numbers in
* an invalid format.
*
* For example, only D allows underscores in the numbers, but they are
* allowed in all formats.
*
* @param[in,out] pc The structure to update, str is an input.
*
* @return Whether a number was parsed
*/
static bool parse_number(TokenContext &ctx, Chunk &pc);
/**
 * Parses a D string constant at the current position.
 *
 * Quoted forms ("..", '..', `..`, r"..", x"..") are delegated to
 * parse_string(). Otherwise a run of backslash escape sequences
 * (\x12, \u1234, \U12345678, \123, \&amp;, \n) is collected into one
 * CT_STRING chunk.
 *
 * Escape sequences that are cut short by the end of the input only take
 * the characters that are actually available.
 *
 * @param ctx  The input cursor
 * @param pc   The chunk to update
 *
 * @return Whether a string was parsed
 */
static bool d_parse_string(TokenContext &ctx, Chunk &pc)
{
   size_t ch = ctx.peek();

   if (  ch == '"'                            // 34
      || ch == '\'')                          // 39
   {
      return(parse_string(ctx, pc, 0, true));
   }

   if (ch == '`')                             // 96
   {
      return(parse_string(ctx, pc, 0, false));
   }

   if (  (  ch == 'r'                         // 114
         || ch == 'x')                        // 120
      && ctx.peek(1) == '"')                  // 34
   {
      return(parse_string(ctx, pc, 1, false));
   }

   if (ch != '\\')
   {
      return(false);
   }
   ctx.save();
   int cnt;

   pc.Str().clear();

   while (ctx.peek() == '\\')                 // 92
   {
      pc.Str().append(ctx.get());

      switch (ctx.peek())
      {
      case 'x':  // \x HexDigit HexDigit
         cnt = 3;

         // stop early at end of input instead of appending NUL characters
         while (  cnt-- > 0
               && ctx.more())
         {
            pc.Str().append(ctx.get());
         }
         break;

      case 'u':  // \u HexDigit (x4)
         cnt = 5;

         while (  cnt-- > 0
               && ctx.more())
         {
            pc.Str().append(ctx.get());
         }
         break;

      case 'U':  // \U HexDigit (x8)
         cnt = 9;

         while (  cnt-- > 0
               && ctx.more())
         {
            pc.Str().append(ctx.get());
         }
         break;

      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
         // handle up to 3 octal digits
         pc.Str().append(ctx.get());
         ch = ctx.peek();

         if (  (ch >= '0')
            && (ch <= '7'))
         {
            pc.Str().append(ctx.get());
            ch = ctx.peek();

            if (  (ch >= '0')
               && (ch <= '7'))
            {
               pc.Str().append(ctx.get());
            }
         }
         break;

      case '&':
         // \& NamedCharacterEntity ;
         pc.Str().append(ctx.get());

         while (unc_isalpha(ctx.peek()))
         {
            pc.Str().append(ctx.get());
         }

         if (ctx.peek() == ';')               // 59
         {
            pc.Str().append(ctx.get());
         }
         break;

      default:

         // Everything else is a single character; a backslash at the very
         // end of the input has nothing following it
         if (ctx.more())
         {
            pc.Str().append(ctx.get());
         }
         break;
      } // switch
   }

   if (pc.GetStr().size() < 1)
   {
      ctx.restore();
      return(false);
   }
   pc.SetType(CT_STRING);
   return(true);
} // d_parse_string
#if 0
/**
 * A string-in-string search. Like strstr() with a haystack length.
 * Currently compiled out by the surrounding '#if 0'.
 *
 * @param needle        NUL-terminated text to look for
 * @param haystack      text to search in
 * @param haystack_len  number of characters of haystack to consider
 *
 * @return pointer to the first match inside haystack, or NULL if none
 */
static const char *str_search(const char *needle, const char *haystack, int haystack_len)
{
int needle_len = strlen(needle);
// try every start position that still leaves room for the whole needle
while (haystack_len-- >= needle_len)
{
if (memcmp(needle, haystack, needle_len) == 0)
{
return(haystack);
}
haystack++;
}
return(NULL);
}
#endif
/**
 * Parses a comment starting at the current position.
 *
 * Handles '//' comments (continued over lines that end with an odd number
 * of backslashes, except in C#), C comments, and D nestable comments when
 * the D language is set. A C comment that is followed only by spaces/tabs
 * and another C comment is merged into the same chunk.
 *
 * After the comment text is collected, it is checked for the
 * enable/disable processing markers and cpd.unc_off is updated.
 *
 * @param ctx  The input cursor
 * @param pc   The chunk to update (text, type and newline count)
 *
 * @return Whether a comment was parsed
 */
static bool parse_comment(TokenContext &ctx, Chunk &pc)
{
bool is_d = language_is_set(LANG_D);
bool is_cs = language_is_set(LANG_CS);
size_t d_level = 0;
// does this start with '/ /' or '/ *' or '/ +' (d)
if ( (ctx.peek() != '/')
|| ( (ctx.peek(1) != '*')
&& (ctx.peek(1) != '/')
&& ( (ctx.peek(1) != '+')
|| !is_d)))
{
return(false);
}
ctx.save();
// account for opening two chars
pc.Str() = ctx.get(); // opening '/'
size_t ch = ctx.get();
pc.Str().append(ch); // second char
if (ch == '/') // 47
{
pc.SetType(CT_COMMENT_CPP);
while (true)
{
// number of consecutive backslashes seen directly before the line end
int bs_cnt = 0;
while (ctx.more())
{
ch = ctx.peek();
if ( (ch == '\r')
|| (ch == '\n'))
{
break;
}
if ( (ch == '\\') // 92
&& !is_cs) // backslashes aren't special in comments in C#
{
bs_cnt++;
}
else
{
bs_cnt = 0;
}
pc.Str().append(ctx.get());
}
/*
* If we hit an odd number of backslashes right before the newline,
* then we keep going.
*/
if ( ((bs_cnt & 1) == 0)
|| !ctx.more())
{
break;
}
// the line ending is part of the continued comment
if (ctx.peek() == '\r')
{
pc.Str().append(ctx.get());
}
if (ctx.peek() == '\n')
{
pc.Str().append(ctx.get());
}
pc.SetNlCount(pc.GetNlCount() + 1);
cpd.did_newline = true;
}
}
else if (!ctx.more())
{
// unexpected end of file
ctx.restore();
return(false);
}
else if (ch == '+') // 43
{
pc.SetType(CT_COMMENT);
// d_level tracks the nesting depth of '/+' ... '+/'
d_level++;
while ( d_level > 0
&& ctx.more())
{
if ( (ctx.peek() == '+') // 43
&& (ctx.peek(1) == '/')) // 47
{
pc.Str().append(ctx.get()); // store the '+'
pc.Str().append(ctx.get()); // store the '/'
d_level--;
continue;
}
if ( (ctx.peek() == '/') // 47
&& (ctx.peek(1) == '+')) // 43
{
pc.Str().append(ctx.get()); // store the '/'
pc.Str().append(ctx.get()); // store the '+'
d_level++;
continue;
}
ch = ctx.get();
pc.Str().append(ch);
if ( (ch == '\n')
|| (ch == '\r'))
{
pc.SetType(CT_COMMENT_MULTI);
pc.SetNlCount(pc.GetNlCount() + 1);
// count the line ending style that was seen
if (ch == '\r')
{
if (ctx.peek() == '\n')
{
++LE_COUNT(CRLF);
pc.Str().append(ctx.get()); // store the '\n'
}
else
{
++LE_COUNT(CR);
}
}
else
{
++LE_COUNT(LF);
}
}
}
}
else // must be '/ *'
{
pc.SetType(CT_COMMENT);
while (ctx.more())
{
if ( (ctx.peek() == '*') // 42
&& (ctx.peek(1) == '/')) // 47
{
pc.Str().append(ctx.get()); // store the '*'
pc.Str().append(ctx.get()); // store the '/'
TokenInfo ss;
ctx.save(ss);
size_t oldsize = pc.GetStr().size();
// If there is another C comment right after this one, combine them
while ( (ctx.peek() == ' ') // 32
|| (ctx.peek() == '\t')) // tab
{
pc.Str().append(ctx.get());
}
if ( (ctx.peek() != '/')
|| (ctx.peek(1) != '*'))
{
// undo the attempt to join
ctx.restore(ss);
pc.Str().resize(oldsize);
break;
}
}
ch = ctx.get();
pc.Str().append(ch);
if ( (ch == '\n')
|| (ch == '\r'))
{
pc.SetType(CT_COMMENT_MULTI);
pc.SetNlCount(pc.GetNlCount() + 1);
// count the line ending style that was seen
if (ch == '\r')
{
if (ctx.peek() == '\n')
{
++LE_COUNT(CRLF);
pc.Str().append(ctx.get()); // store the '\n'
}
else
{
++LE_COUNT(CR);
}
}
else
{
++LE_COUNT(LF);
}
}
}
}
// processing is currently disabled: look for the marker that re-enables it
if (cpd.unc_off)
{
bool found_enable_marker = (find_enable_processing_comment_marker(pc.GetStr()) >= 0);
if (found_enable_marker)
{
const auto &ontext = options::enable_processing_cmt();
LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n",
__func__, __LINE__, ontext.c_str(), pc.GetOrigLine());
cpd.unc_off = false;
}
}
else
{
auto position_disable_processing_cmt = find_disable_processing_comment_marker(pc.GetStr());
bool found_disable_marker = (position_disable_processing_cmt >= 0);
if (found_disable_marker)
{
/**
* the user may wish to disable processing part of a multiline comment,
* in which case we'll handle at a later time. Check to see if processing
* is re-enabled elsewhere in this comment
*/
auto position_enable_processing_cmt = find_enable_processing_comment_marker(pc.GetStr());
// only switch off when the enable marker does not come after the disable marker
if (position_enable_processing_cmt < position_disable_processing_cmt)
{
const auto &offtext = options::disable_processing_cmt();
LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n",
__func__, __LINE__, offtext.c_str(), pc.GetOrigLine());
cpd.unc_off = true;
// Issue #842
cpd.unc_off_used = true;
}
}
}
return(true);
} // parse_comment
/**
 * Parses an Xcode code placeholder of the form <# ... #>.
 *
 * @param ctx  The input cursor; restored if no closing '#>' is found
 * @param pc   The chunk to update
 *
 * @return Whether a placeholder was parsed
 */
static bool parse_code_placeholder(TokenContext &ctx, Chunk &pc)
{
   const bool has_opener = (  (ctx.peek() == '<')
                           && (ctx.peek(1) == '#'));

   if (!has_opener)
   {
      return(false);
   }
   ctx.save();

   // take the opening '<#'
   pc.Str() = ctx.get();
   pc.Str().append(ctx.get());

   // collect text until the pair '#>' has been read
   size_t previous = 0;

   while (ctx.more())
   {
      const size_t current = ctx.get();

      pc.Str().append(current);

      if (  (previous == '#')                 // 35
         && (current == '>'))                 // 62
      {
         pc.SetType(CT_WORD);
         return(true);
      }
      previous = current;
   }
   // ran out of input without a terminator
   ctx.restore();
   return(false);
}
/**
 * Appends an attached suffix (a run of keyword characters) to the chunk.
 *
 * For strings, nothing is taken when the next text is L" or L' or S",
 * and a suffix of at least 4 characters starting with PRI or SCN is
 * rolled back again.
 *
 * @param ctx        The input cursor
 * @param pc         The chunk whose text is extended
 * @param forstring  Whether the suffix follows a string literal
 */
static void parse_suffix(TokenContext &ctx, Chunk &pc, bool forstring = false)
{
   if (!CharTable::IsKw1(ctx.peek()))
   {
      return;
   }

   if (forstring)
   {
      // don't add the suffix if we see L" or L' or S"
      const size_t first     = ctx.peek();
      const size_t second    = ctx.peek(1);
      const bool   l_literal = (  (first == 'L')         // 76
                               && (  (second == '"')     // 34
                                  || (second == '\''))); // 39
      const bool   s_literal = (  (first == 'S')         // 83
                               && (second == '"'));      // 34

      if (  l_literal
         || s_literal)
      {
         return;
      }
   }
   const size_t base_size = pc.GetStr().size();
   TokenInfo    mark;

   ctx.save(mark);

   size_t suffix_len = 0;

   while (  ctx.more()
         && CharTable::IsKw2(ctx.peek()))
   {
      pc.Str().append(ctx.get());
      ++suffix_len;
   }

   if (  !forstring
      || suffix_len < 4)
   {
      return;
   }

   // format/scan specifier macros are not part of the string
   if (  pc.GetStr().startswith("PRI", base_size)
      || pc.GetStr().startswith("SCN", base_size))
   {
      ctx.restore(mark);
      pc.Str().resize(base_size);
   }
} // parse_suffix
//! true for the binary digits '0' and '1'
static bool is_bin(int ch)
{
   switch (ch)
   {
   case '0':                                  // 48
   case '1':                                  // 49
      return(true);

   default:
      return(false);
   }
}


//! true for a binary digit or a digit separator ('_' or '\'')
static bool is_bin_(int ch)
{
   if (  ch == '_'                            // 95
      || ch == '\'')                          // 39
   {
      return(true);
   }
   return(is_bin(ch));
}
//! true for the octal digits '0' through '7'
static bool is_oct(int ch)
{
   if (ch < '0')                              // 48
   {
      return(false);
   }
   return(ch <= '7');                         // 55
}


//! true for an octal digit or a digit separator ('_' or '\'')
static bool is_oct_(int ch)
{
   if (  ch == '_'                            // 95
      || ch == '\'')                          // 39
   {
      return(true);
   }
   return(is_oct(ch));
}
//! true for the decimal digits '0' through '9'
static bool is_dec(int ch)
{
   if (ch < '0')                              // 48
   {
      return(false);
   }
   return(ch <= '9');                         // 57
}


//! true for a decimal digit or a digit separator
static bool is_dec_(int ch)
{
   // number separators: JAVA: "_", C++14: "'"
   if (  ch == '_'                            // 95
      || ch == '\'')                          // 39
   {
      return(true);
   }
   return(is_dec(ch));
}
//! true for '0'-'9', 'a'-'f' and 'A'-'F'
static bool is_hex(int ch)
{
   const bool digit = (  (ch >= '0')          // 48
                      && (ch <= '9'));        // 57
   const bool lower = (  (ch >= 'a')          // 97
                      && (ch <= 'f'));        // 102
   const bool upper = (  (ch >= 'A')          // 65
                      && (ch <= 'F'));        // 70

   return(  digit
         || lower
         || upper);
}


//! true for a hexadecimal digit or a digit separator ('_' or '\'')
static bool is_hex_(int ch)
{
   if (  ch == '_'                            // 95
      || ch == '\'')                          // 39
   {
      return(true);
   }
   return(is_hex(ch));
}
/**
 * Parses a numeric literal at the current position and appends it to pc.
 *
 * Nothing is consumed when the text does not start with a digit or with
 * '.' followed by a digit. The text is appended to pc.Str() without
 * clearing it first (presumably the caller passes an empty chunk - TODO confirm).
 *
 * The chunk type is set to CT_NUMBER or CT_NUMBER_FP; in one case (an
 * I/F/D/M suffix letter on a number without a decimal point) the token is
 * extended with the following keyword characters and typed CT_WORD.
 *
 * @param ctx  The input cursor
 * @param pc   The chunk to update
 *
 * @return Whether a number was parsed
 */
static bool parse_number(TokenContext &ctx, Chunk &pc)
{
/*
* A number must start with a digit or a dot, followed by a digit
* (signs handled elsewhere)
*/
if ( !is_dec(ctx.peek())
&& ( (ctx.peek() != '.') // 46
|| !is_dec(ctx.peek(1))))
{
return(false);
}
bool is_float = (ctx.peek() == '.'); // 46
if ( is_float
&& (ctx.peek(1) == '.')) // make sure it isn't '..' 46
{
return(false);
}
/*
* Check for Hex, Octal, or Binary
* Note that only D, C++14 and Pawn support binary
* Fixes the issue # 1591
* In c# the numbers starting with 0 are not treated as octal numbers.
*/
bool did_hex = false;
if ( ctx.peek() == '0' // 48
&& !language_is_set(LANG_CS))
{
size_t ch;
Chunk pc_temp;
pc.Str().append(ctx.get()); // store the '0'
pc_temp.Str().append('0');
// MS constant might have an "h" at the end. Look for it
// (look-ahead only: the position is restored after collecting into pc_temp)
ctx.save();
while ( ctx.more()
&& CharTable::IsKw2(ctx.peek()))
{
ch = ctx.get();
pc_temp.Str().append(ch);
}
// last character of the look-ahead text
ch = pc_temp.GetStr()[pc_temp.Len() - 1];
ctx.restore();
LOG_FMT(LBCTRL, "%s(%d): pc_temp:%s\n", __func__, __LINE__, pc_temp.Text());
if (ch == 'h') // TODO can we combine this in analyze_character 104
{
// we have an MS hexadecimal number with "h" at the end
LOG_FMT(LBCTRL, "%s(%d): MS hexadecimal number\n", __func__, __LINE__);
did_hex = true;
do
{
pc.Str().append(ctx.get()); // store the rest
} while (is_hex_(ctx.peek()));
pc.Str().append(ctx.get()); // store the h
LOG_FMT(LBCTRL, "%s(%d): pc:%s\n", __func__, __LINE__, pc.Text());
}
else
{
switch (unc_toupper(ctx.peek()))
{
case 'X': // hex
did_hex = true;
do
{
pc.Str().append(ctx.get()); // store the 'x' and then the rest
} while (is_hex_(ctx.peek()));
break;
case 'B': // binary
do
{
pc.Str().append(ctx.get()); // store the 'b' and then the rest
} while (is_bin_(ctx.peek()));
break;
case '0': // octal or decimal
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
do
{
pc.Str().append(ctx.get());
} while (is_oct_(ctx.peek()));
break;
default:
// either just 0 or 0.1 or 0UL, etc
break;
} // switch
}
}
else
{
// Regular int or float
while (is_dec_(ctx.peek()))
{
pc.Str().append(ctx.get());
}
}
// Check if we stopped on a decimal point & make sure it isn't '..'
if ( (ctx.peek() == '.') // 46
&& (ctx.peek(1) != '.')) // 46
{
// Issue #1265, 5.clamp()
// look ahead: if keyword characters after the '.' lead to '(', the
// '.' is not part of the number
TokenInfo ss;
ctx.save(ss);
while ( ctx.more()
&& CharTable::IsKw2(ctx.peek(1)))
{
// skip characters to check for paren open
ctx.get();
}
if (ctx.peek(1) == '(') // 40
{
ctx.restore(ss);
pc.SetType(CT_NUMBER);
return(true);
}
else
{
ctx.restore(ss);
}
pc.Str().append(ctx.get());
is_float = true;
// fractional digits use the same digit set as the integer part
if (did_hex)
{
while (is_hex_(ctx.peek()))
{
pc.Str().append(ctx.get());
}
}
else
{
while (is_dec_(ctx.peek()))
{
pc.Str().append(ctx.get());
}
}
}
/*
* Check exponent
* Valid exponents per language (not that it matters):
* C/C++/D/Java: eEpP
* C#/Pawn: eE
*/
size_t tmp = unc_toupper(ctx.peek());
if ( (tmp == 'E') // 69
|| (tmp == 'P')) // 80
{
is_float = true;
pc.Str().append(ctx.get());
if ( (ctx.peek() == '+') // 43
|| (ctx.peek() == '-')) // 45
{
pc.Str().append(ctx.get());
}
while (is_dec_(ctx.peek()))
{
pc.Str().append(ctx.get());
}
}
/*
* Check the suffixes
* Valid suffixes per language (not that it matters):
* Integer Float
* C/C++: uUlL64 lLfF
* C#: uUlL fFdDMm
* D: uUL ifFL
* Java: lL fFdD
* Pawn: (none) (none)
*
* Note that i, f, d, and m only appear in floats.
*/
// the loop ends at the first character that is not one of I, F, D, M, L, U
while (1)
{
size_t tmp2 = unc_toupper(ctx.peek());
// https://en.cppreference.com/w/cpp/language/floating_literal
if ( (tmp2 == 'I') // 73
|| (tmp2 == 'F') // 70
|| (tmp2 == 'D') // 68
|| (tmp2 == 'M')) // 77
{
// is a decimal point found? Issue #4027
const char *test_it = pc.Text();
size_t test_long = strlen(test_it);
bool point_found = false;
for (size_t ind = 0; ind < test_long; ind++)
{
if (test_it[ind] == '.')
{
point_found = true;
break;
}
}
if (point_found)
{
is_float = true;
}
else
{
// no decimal point: take the rest of the identifier and treat the token as a word
// append the char(s) until is not IsKw2
while (ctx.more())
{
size_t ch = ctx.peek();
if (CharTable::IsKw2(ch))
{
pc.Str().append(ctx.get());
}
else
{
break;
}
}
pc.SetType(CT_WORD);
return(true);
}
}
else if ( (tmp2 != 'L') // 76
&& (tmp2 != 'U')) // 85
{
break;
}
pc.Str().append(ctx.get());
}
// skip the Microsoft-specific '8' suffix
if ((ctx.peek() == '8')) // 56
{
pc.Str().append(ctx.get());
}
// skip the Microsoft-specific '16', '32' and '64' suffix
if ( ( (ctx.peek() == '1') // 49
&& (ctx.peek(1) == '6')) // 54
|| ( (ctx.peek() == '3') // 51
&& (ctx.peek(1) == '2')) // 50
|| ( (ctx.peek() == '6') // 54
&& (ctx.peek(1) == '4'))) // 52
{
pc.Str().append(ctx.get());
pc.Str().append(ctx.get());
}
// skip the Microsoft-specific '128' suffix
if (( (ctx.peek() == '1') // 49
&& (ctx.peek(1) == '2') // 50
&& (ctx.peek(2) == '8'))) // 56
{
pc.Str().append(ctx.get());
pc.Str().append(ctx.get());
pc.Str().append(ctx.get());
}
pc.SetType(is_float ? CT_NUMBER_FP : CT_NUMBER);
/*
* If there is anything left, then we are probably dealing with garbage or
* some sick macro junk. Eat it.
*/
parse_suffix(ctx, pc);
return(true);
} // parse_number
/**
 * Parses a quoted string starting at the current position.
 *
 * The first quote_idx characters are copied as a prefix, the next
 * character is the opening quote; its CharTable entry (low 8 bits)
 * determines the terminating character. Text is collected up to and
 * including the terminator, or to the end of the input.
 *
 * A line ending inside the string turns the chunk into CT_STRING_MULTI
 * and is counted once: at the '\n' for LF and CRLF, at the '\r' for a
 * lone CR. A lone CR does not consume the character that follows it, so
 * a closing quote or escape character right after it is still seen.
 *
 * @param ctx           The input cursor
 * @param pc            The chunk to update
 * @param quote_idx     Number of prefix characters before the opening quote
 * @param allow_escape  Whether the configured escape characters are honored
 *
 * @return Whether a string was parsed (always true)
 */
static bool parse_string(TokenContext &ctx, Chunk &pc, size_t quote_idx, bool allow_escape)
{
   log_rule_B("string_escape_char");
   const size_t escape_char = options::string_escape_char();

   log_rule_B("string_escape_char2");
   const size_t escape_char2 = options::string_escape_char2();

   log_rule_B("string_replace_tab_chars");
   const bool should_escape_tabs = (  allow_escape
                                   && options::string_replace_tab_chars()
                                   && language_is_set(LANG_ALLC));

   pc.Str().clear();

   while (quote_idx-- > 0)
   {
      pc.Str().append(ctx.get());
   }
   pc.SetType(CT_STRING);
   const size_t termination_character = CharTable::Get(ctx.peek()) & 0xff;

   pc.Str().append(ctx.get());                          // store the "

   bool escaped = false;

   while (ctx.more())
   {
      const size_t ch = ctx.get();

      // convert char 9 (\t) to chars \t
      if (  (ch == '\t')
         && should_escape_tabs)
      {
         const size_t lastcol = ctx.c.col - 1;
         ctx.c.col = lastcol + 2;
         pc.Str().append(escape_char);
         pc.Str().append('t');
         continue;
      }
      pc.Str().append(ch);

      // LF (also the LF of CRLF) or a lone CR ends a line inside the string
      if (  ch == '\n'
         || (  ch == '\r'
            && ctx.peek() != '\n'))
      {
         pc.SetNlCount(pc.GetNlCount() + 1);
         pc.SetType(CT_STRING_MULTI);
      }

      // if last char in prev loop was escaped the one in the current loop isn't
      if (escaped)
      {
         escaped = false;
         continue;
      }

      // see if the current char is a escape char
      if (allow_escape)
      {
         if (ch == escape_char)
         {
            escaped = (escape_char != 0);
            continue;
         }

         if (  ch == escape_char2
            && (ctx.peek() == termination_character))
         {
            escaped = allow_escape;
            continue;
         }
      }

      if (ch == termination_character)
      {
         break;
      }
   }
   parse_suffix(ctx, pc, true);
   return(true);
} // parse_string
//! Bit flags describing the kind of C# string that was opened.
enum cs_string_t
{
   CS_STRING_NONE         = 0x00,
   CS_STRING_STRING       = 0x01,   // is any kind of string
   CS_STRING_VERBATIM     = 0x02,   // @"" style string
   CS_STRING_INTERPOLATED = 0x04,   // $"" or $@"" style string
};


//! Sets the flags of 'other' in 'value' and returns the combined result.
static cs_string_t operator|=(cs_string_t &value, cs_string_t other)
{
   const int combined = static_cast<int>(value) | static_cast<int>(other);

   value = static_cast<cs_string_t>(combined);
   return(value);
}
/**
 * Checks for the opening of a C# string: an optional '$', an optional '@'
 * and then '"'. On a match the opening characters are appended to pc and
 * the chunk type is set to CT_STRING; otherwise nothing is consumed.
 *
 * @param ctx  The input cursor
 * @param pc   The chunk to update
 *
 * @return The flags of the opened string, or CS_STRING_NONE
 */
static cs_string_t parse_cs_string_start(TokenContext &ctx, Chunk &pc)
{
   cs_string_t kind       = CS_STRING_NONE;
   int         prefix_len = 0;

   if (ctx.peek(prefix_len) == '$')           // 36
   {
      kind |= CS_STRING_INTERPOLATED;
      ++prefix_len;
   }

   if (ctx.peek(prefix_len) == '@')           // 64
   {
      kind |= CS_STRING_VERBATIM;
      ++prefix_len;
   }

   if (ctx.peek(prefix_len) != '"')           // 34
   {
      // a prefix without a quote is not a string
      return(CS_STRING_NONE);
   }
   kind |= CS_STRING_STRING;
   pc.SetType(CT_STRING);

   // take the prefix characters plus the opening quote
   int remaining = prefix_len + 1;

   while (remaining-- > 0)
   {
      pc.Str().append(ctx.get());
   }
   return(kind);
} // parse_cs_string_start
//! Parse state of one (possibly nested) C# string.
struct CsStringParseState
{
   cs_string_t type;       //! flags of this string
   int         braceDepth; //! brace nesting depth, starts at 0

   CsStringParseState(cs_string_t stringType)
      : type(stringType)
      , braceDepth(0)
   {
   }
};
/**
* C# strings are complex enough (mostly due to interpolation and nesting) that they need a custom parser.
*
* A stack holds one entry per nested string; each entry tracks the flags
* of that string and its current brace depth. Parsing ends when the
* outermost string is closed or the input is exhausted.
*
* @param ctx The input cursor
* @param pc The chunk to update
*
* @return Whether a string was parsed
*/
static bool parse_cs_string(TokenContext &ctx, Chunk &pc)
{
cs_string_t stringType = parse_cs_string_start(ctx, pc);
if (stringType == CS_STRING_NONE)
{
return(false);
}
// an interpolated string can contain {expressions}, which can contain $"strings", which in turn
// can contain {expressions}, so we must track both as they are interleaved, in order to properly
// parse the outermost string.
std::stack<CsStringParseState> parseState; // each entry is a nested string
parseState.push(CsStringParseState(stringType));
log_rule_B("string_replace_tab_chars");
bool should_escape_tabs = options::string_replace_tab_chars();
while (ctx.more())
{
if (parseState.top().braceDepth > 0)
{
// all we can do when in an expr is look for expr close with }, or a new string opening. must do this first
// so we can peek and potentially consume chars for new string openings, before the ch=get() happens later,
// which is needed for newline processing.
if (ctx.peek() == '}') // 125
{
pc.Str().append(ctx.get());
if (ctx.peek() == '}') // 125
{
pc.Str().append(ctx.get()); // in interpolated string, `}}` is escape'd `}`
}
else
{
--parseState.top().braceDepth;
}
continue;
}
stringType = parse_cs_string_start(ctx, pc);
if (stringType != CS_STRING_NONE)
{
parseState.push(CsStringParseState(stringType));
continue;
}
}
// column before the character is consumed, used for the tab replacement below
int lastcol = ctx.c.col;
int ch = ctx.get();
pc.Str().append(ch);
if (ch == '\n')
{
pc.SetType(CT_STRING_MULTI);
pc.SetNlCount(pc.GetNlCount() + 1);
}
else if (ch == '\r')
{
// only the type changes here; the newline count is incremented for '\n' only
pc.SetType(CT_STRING_MULTI);
}
else if (parseState.top().braceDepth > 0)
{
// do nothing. if we're in a brace, we only want the newline handling, and skip the rest.
}
else if ( (ch == '\t')
&& should_escape_tabs)
{
if (parseState.top().type & CS_STRING_VERBATIM)
{
// warn only once (guarded by the cpd flag)
if (!cpd.warned_unable_string_replace_tab_chars)
{
cpd.warned_unable_string_replace_tab_chars = true;
log_rule_B("warn_level_tabs_found_in_verbatim_string_literals");
log_sev_t warnlevel = (log_sev_t)options::warn_level_tabs_found_in_verbatim_string_literals();
/*
* a tab char can't be replaced with \\t because escapes don't
* work in here-strings. best we can do is warn.
*/
LOG_FMT(warnlevel, "%s(%d): %s: orig line is %zu, orig col is %zu, Detected non-replaceable tab char in literal string\n",
__func__, __LINE__, cpd.filename.c_str(), pc.GetOrigLine(), pc.GetOrigCol());
LOG_FMT(warnlevel, "%s(%d): Warning is given if doing tab-to-\\t replacement and we have found one in a C# verbatim string literal.\n",
__func__, __LINE__);
if (warnlevel < LWARN)
{
// TODO: replace the code ?? cpd.error_count++;
}
}
}
else
{
// replace the tab that was just appended with the two characters \t
ctx.c.col = lastcol + 2;
pc.Str().pop_back(); // remove \t
pc.Str().append("\\t");
continue;
}
}
else if ( ch == '\\'
&& !(parseState.top().type & CS_STRING_VERBATIM))
{
// catch escaped quote in order to avoid ending string (but also must handle \\ to avoid accidental 'escape' seq of `\\"`)
if ( ctx.peek() == '"' // 34
|| ctx.peek() == '\\') // 92
{
pc.Str().append(ctx.get());
}
}
else if (ch == '"') // 34
{
if ( (parseState.top().type & CS_STRING_VERBATIM)
&& (ctx.peek() == '"')) // 34
{
// in verbatim string, `""` is escape'd `"`
pc.Str().append(ctx.get());
}
else
{
// end of string
parseState.pop();
// closing the outermost string ends the token
if (parseState.empty())
{
break;
}
}
}
else if (parseState.top().type & CS_STRING_INTERPOLATED)
{
if (ch == '{') // 123
{
if (ctx.peek() == '{') // 123
{
pc.Str().append(ctx.get()); // in interpolated string, `{{` is escape'd `{`
}
else
{
++parseState.top().braceDepth;
}
}
}
}
return(true);
} // parse_cs_string
/**
 * Parses a verbatim string that is opened and closed by three quotes.
 * The current position must be at the opening """.
 *
 * @param ctx  The input cursor
 * @param pc   The chunk to update
 */
static void parse_verbatim_string(TokenContext &ctx, Chunk &pc)
{
   pc.SetType(CT_STRING);

   // take the three opening quotes
   pc.Str() = ctx.get();

   for (int i = 0; i < 2; ++i)
   {
      pc.Str().append(ctx.get());
   }

   // run until the closing """ or the end of the input
   while (ctx.more())
   {
      const size_t current = ctx.get();

      pc.Str().append(current);

      const bool at_close = (  (current == '"')        // 34
                            && (ctx.peek() == '"')     // 34
                            && (ctx.peek(1) == '"'));  // 34

      if (at_close)
      {
         pc.Str().append(ctx.get());
         pc.Str().append(ctx.get());
         break;
      }

      if (  (current == '\n')
         || (current == '\r'))
      {
         pc.SetType(CT_STRING_MULTI);
         pc.SetNlCount(pc.GetNlCount() + 1);
      }
   }
}
/**
 * Compares two runs of 'len' characters inside the same deque.
 *
 * @param d      The character data
 * @param a_idx  Start index of the first run
 * @param b_idx  Start index of the second run
 * @param len    Number of characters to compare
 *
 * @return true if both runs start at the same index or hold the same
 *         characters; false on a mismatch or if a run does not fit in d
 */
static bool tag_compare(const std::deque<int> &d, size_t a_idx, size_t b_idx, size_t len)
{
   if (a_idx == b_idx)
   {
      // a run always equals itself
      return(true);
   }
   const size_t total = d.size();

   // both runs must lie completely inside the data
   if (  len > total
      || a_idx > total - len
      || b_idx > total - len)
   {
      return(false);
   }

   // advance both positions together so that every character pair is checked
   for (size_t off = 0; off < len; ++off)
   {
      if (d[a_idx + off] != d[b_idx + off])
      {
         return(false);
      }
   }

   return(true);
}
/**
 * Parses a raw string of the form  prefix"tag( ... )tag"  where the tag
 * may be empty. Newlines may appear inside the string.
 *
 * @param ctx    The input cursor; restored if the string is not complete
 * @param pc     The chunk to update
 * @param q_idx  Offset of the opening quote from the current position
 *
 * @return Whether a raw string was parsed
 */
static bool parse_cr_string(TokenContext &ctx, Chunk &pc, size_t q_idx)
{
   const size_t tag_start = ctx.c.idx + q_idx + 1;

   ctx.save();

   // Copy the prefix + " to the string
   pc.Str().clear();

   for (int left = q_idx + 1; left != 0; --left)
   {
      pc.Str().append(ctx.get());
   }
   // Copy the tag and measure it
   size_t tag_len = 0;

   while (  ctx.more()
         && (ctx.peek() != '('))
   {
      pc.Str().append(ctx.get());
      ++tag_len;
   }

   if (ctx.peek() != '(')
   {
      ctx.restore();
      return(false);
   }
   pc.SetType(CT_STRING);

   while (ctx.more())
   {
      const bool at_terminator = (  (ctx.peek() == ')')               // 41
                                 && (ctx.peek(tag_len + 1) == '"')    // 34
                                 && tag_compare(ctx.data, tag_start, ctx.c.idx + 1, tag_len));

      if (at_terminator)
      {
         // take the ')' , the tag and the closing quote
         for (int left = tag_len + 2; left != 0; --left)
         {
            pc.Str().append(ctx.get());
         }
         parse_suffix(ctx, pc);
         return(true);
      }
      const bool is_newline = (ctx.peek() == '\n');

      pc.Str().append(ctx.get());

      if (is_newline)
      {
         pc.SetNlCount(pc.GetNlCount() + 1);
         pc.SetType(CT_STRING_MULTI);
      }
   }
   // end of input without a terminator
   ctx.restore();
   return(false);
} // parse_cr_string
/**
* Count the number of characters in a word.
* The first character is already valid for a keyword
*
* After the word is collected its type is refined: macro name inside a
* '#define', Java annotation, or the keyword type returned by
* find_keyword_type(). If the keyword type is CT_COMMENT_CPP the rest of
* the line is appended to the chunk as well.
*
* @param ctx The input cursor
* @param pc The structure to update, str is an input.
* @param skipcheck If true, the chunk stays CT_WORD and the type refinement is skipped
* @return Whether a word was parsed (always true)
*/
static bool parse_word(TokenContext &ctx, Chunk &pc, bool skipcheck)
{
static UncText intr_txt("@interface");
// The first character is already valid
pc.Str().clear();
pc.Str().append(ctx.get());
while (ctx.more())
{
size_t ch = ctx.peek();
if (CharTable::IsKw2(ch))
{
pc.Str().append(ctx.get());
}
else if ( (ch == '\\') // 92
&& (unc_tolower(ctx.peek(1)) == 'u')) // 117
{
// backslash followed by 'u' or 'U' inside a word: keep both characters
pc.Str().append(ctx.get());
pc.Str().append(ctx.get());
skipcheck = true;
}
else
{
break;
}
// HACK: Non-ASCII character are only allowed in identifiers
if (ch > 0x7f)
{
skipcheck = true;
}
}
pc.SetType(CT_WORD);
if (skipcheck)
{
return(true);
}
// Detect pre-processor functions now
if ( cpd.in_preproc == CT_PP_DEFINE
&& cpd.preproc_ncnl_count == 1)
{
if (ctx.peek() == '(') // 40
{
pc.SetType(CT_MACRO_FUNC);
}
else
{
pc.SetType(CT_MACRO);
log_rule_B("pp_ignore_define_body");
if (options::pp_ignore_define_body())
{
/*
* We are setting the PP_IGNORE preproc state because the following
* chunks are part of the macro body and will have to be ignored.
*/
cpd.in_preproc = CT_PP_IGNORE;
}
}
}
else
{
// '@interface' is reserved, not an interface itself
if ( language_is_set(LANG_JAVA)
&& pc.GetStr().startswith("@")
&& !pc.GetStr().equals(intr_txt))
{
pc.SetType(CT_ANNOTATION);
}
else
{
// Turn it into a keyword now
// Issue #1460 will return "COMMENT_CPP"
pc.SetType(find_keyword_type(pc.Text(), pc.GetStr().size()));
/* Special pattern: if we're trying to redirect a preprocessor directive to PP_IGNORE,
* then ensure we're actually part of a preprocessor before doing the swap, or we'll
* end up with a function named 'define' as PP_IGNORE. This is necessary because with
* the config 'set' feature, there's no way to do a pair of tokens as a word
* substitution. */
if ( pc.GetType() == CT_PP_IGNORE
&& !cpd.in_preproc)
{
// NOTE(review): same call and arguments as above - confirm that a second lookup can yield a different type
pc.SetType(find_keyword_type(pc.Text(), pc.GetStr().size()));
}
else if (pc.GetType() == CT_COMMENT_CPP) // Issue #1460
{
size_t ch;
bool is_cs = language_is_set(LANG_CS);
// read until EOL
while (true)
{
// number of consecutive backslashes seen directly before the line end
int bs_cnt = 0;
while (ctx.more())
{
ch = ctx.peek();
if ( (ch == '\r')
|| (ch == '\n'))
{
break;
}
if ( (ch == '\\') // 92
&& !is_cs) // backslashes aren't special in comments in C#
{
bs_cnt++;
}
else
{
bs_cnt = 0;
}
pc.Str().append(ctx.get());
}
/*
* If we hit an odd number of backslashes right before the newline,
* then we keep going.
*/
if ( ((bs_cnt & 1) == 0)
|| !ctx.more())
{
break;
}
if (ctx.peek() == '\r')
{
pc.Str().append(ctx.get());
}
if (ctx.peek() == '\n')
{
pc.Str().append(ctx.get());
}
pc.SetNlCount(pc.GetNlCount() + 1);
cpd.did_newline = true;
}
// Store off the end column
pc.SetOrigColEnd(ctx.c.col);
}
}
}
return(true);
} // parse_word
/**
 * Looks ahead (using peek only, nothing is consumed) for a sequence that
 * is opened by a doubled '[' and closed by a doubled ']', with whitespace
 * allowed between the characters; presumably a C++ attribute specifier
 * such as [[nodiscard]] - TODO confirm against the caller.
 *
 * Characters are examined in pairs: ch1 is the current character and ch2
 * the next non-whitespace character after it.
 *
 * @param ctx  The input cursor
 *
 * @return The look-ahead offset reached when the bracket nesting returns
 *         to zero, or 0 if no such sequence was found
 */
static size_t parse_attribute_specifier_sequence(TokenContext &ctx)
{
size_t nested = 0; // nesting depth of doubled brackets
size_t offset = 0; // look-ahead offset of the next character to peek
size_t parens = 0; // nesting depth of parentheses
auto ch1 = ctx.peek(offset++);
// peek yields 0 past the end of the input, which ends the loop
while (ch1)
{
auto ch2 = ctx.peek(offset++);
// skip whitespace between the two characters of a pair
while ( ch2 == ' ' // 32
|| ch2 == '\n'
|| ch2 == '\r'
|| ch2 == '\t')
{
ch2 = ctx.peek(offset++);
}
// outside of any doubled bracket the next character has to be '['
if ( nested == 0
&& ch2 != '[') // 91
{
break;
}
if (ch1 == '(') // 40
{
++parens;
ch1 = ch2;
continue;
}
if (ch1 == ')') // 41
{
// a closing paren without a matching opening one ends the scan
if (parens == 0)
{
break;
}
--parens;
ch1 = ch2;
continue;
}
// any other character that is not a bracket is skipped
if ( ch1 != '['
&& ch1 != ']')
{
ch1 = ch2;
continue;
}
// a single bracket (not doubled) is only accepted inside parentheses
if (ch2 != ch1)
{
if (parens == 0)
{
break;
}
ch1 = ch2;
continue;
}
if (ch1 == '[') // 91
{
// a doubled '[' inside an open sequence is only accepted inside parentheses
if ( nested != 0
&& parens == 0)
{
break;
}
++nested;
}
else if (--nested == 0)
{
// the doubled ']' closed the outermost sequence
return(offset);
}
// both characters of the doubled bracket are used up; continue after them
ch1 = ctx.peek(offset++);
}
return(0);
} // parse_attribute_specifier_sequence
/**
 * Consumes 'length' characters into pc and marks the chunk as CT_ATTRIBUTE.
 *
 * @param ctx     The input cursor
 * @param pc      The chunk to update
 * @param length  Number of characters to take
 *
 * @return always true
 */
static bool extract_attribute_specifier_sequence(TokenContext &ctx, Chunk &pc, size_t length)
{
   pc.Str().clear();

   for (size_t left = length; left != 0; --left)
   {
      pc.Str().append(ctx.get());
   }

   pc.SetType(CT_ATTRIBUTE);
   return(true);
} // extract_attribute_specifier_sequence
/**
 * Consume a run of whitespace characters.
 *
 * While consuming, the line-ending statistics (LF / CRLF / CR) are updated
 * and the "original previous spaces" value of pc is tracked: it is reset at
 * every newline and grows with each blank or tab that follows.
 *
 * @param ctx  the token context to read from
 * @param pc   the chunk to fill; becomes CT_NEWLINE if at least one newline
 *             was consumed, CT_WHITESPACE otherwise
 *
 * @return true if a non-zero whitespace character was consumed
 */
static bool parse_whitespace(TokenContext &ctx, Chunk &pc)
{
   size_t newlines = 0;
   size_t consumed = 0;   // last character taken with ctx.get()

   // REVISIT: use a better whitespace detector?
   for ( ; ctx.more() && unc_isspace(ctx.peek()); )
   {
      int col_before = ctx.c.col;

      consumed = ctx.get(); // throw away the whitespace char

      if (consumed == '\r')
      {
         if (ctx.expect('\n'))
         {
            // CRLF ending
            ++LE_COUNT(CRLF);
         }
         else
         {
            // CR ending
            ++LE_COUNT(CR);
         }
         newlines++;
         pc.SetOrigPrevSp(0);
      }
      else if (consumed == '\n')
      {
         // LF ending
         ++LE_COUNT(LF);
         newlines++;
         pc.SetOrigPrevSp(0);
      }
      else if (consumed == '\t')
      {
         // a tab counts for as many columns as it actually advanced
         pc.SetOrigPrevSp(pc.GetOrigPrevSp() + ctx.c.col - col_before);
      }
      else if (consumed == ' ')
      {
         pc.SetOrigPrevSp(pc.GetOrigPrevSp() + 1);
      }
   }

   if (consumed == 0)
   {
      return(false);
   }
   pc.Str().clear();
   pc.SetType(newlines ? CT_NEWLINE : CT_WHITESPACE);
   pc.SetNlCount(newlines);
   pc.SetAfterTab((ctx.c.last_ch == '\t'));
   return(true);
} // parse_whitespace
/**
 * Try to parse a backslash-newline: a '\' followed by optional whitespace
 * and then a line ending (LF, CR or CRLF).
 *
 * The caller has already checked that the current character is the '\'.
 *
 * @param ctx  the token context; restored to its entry state on failure
 * @param pc   the chunk to fill; becomes CT_NL_CONT with text "\" on success
 *
 * @return true if a backslash-newline was consumed
 */
static bool parse_bs_newline(TokenContext &ctx, Chunk &pc)
{
   ctx.save();
   ctx.get(); // skip the '\'

   while (ctx.more())
   {
      const size_t ws = ctx.peek();

      if (!unc_isspace(ws))
      {
         break;
      }
      ctx.get();

      if (  (ws != '\r')
         && (ws != '\n'))
      {
         // blank between the backslash and the line end, keep looking
         continue;
      }

      if (ws == '\r')
      {
         // swallow the LF of a CRLF pair, if there is one
         ctx.expect('\n');
      }
      pc.SetType(CT_NL_CONT);
      pc.Str() = "\\";
      pc.SetNlCount(1);
      return(true);
   }
   ctx.restore();
   return(false);
}
/**
 * Consume optional blanks/tabs followed by exactly one line ending
 * (LF, CR or CRLF).
 *
 * @param ctx  the token context; restored to its entry state if no line
 *             ending follows the blanks
 *
 * @return true if a line ending was consumed
 */
static bool parse_newline(TokenContext &ctx)
{
   ctx.save();

   // Eat whitespace
   size_t next = ctx.peek();

   while (  (next == ' ')  // 32
         || (next == '\t'))
   {
      ctx.get();
      next = ctx.peek();
   }

   if (next == '\n')
   {
      ctx.get();
      return(true);
   }

   if (next == '\r')
   {
      ctx.get();
      ctx.expect('\n'); // second half of a CRLF pair, if present
      return(true);
   }
   ctx.restore();
   return(false);
}
/**
 * Collect a PAWN pattern: everything up to the next whitespace character,
 * an escaped newline, or the end of the input.
 *
 * @param ctx  the token context to read from
 * @param pc   the chunk to fill; any previous text is discarded
 * @param tt   the token type to assign to pc
 */
static void parse_pawn_pattern(TokenContext &ctx, Chunk &pc, E_Token tt)
{
   pc.Str().clear();
   pc.SetType(tt);

   // ctx.peek() yields 0 once the input is exhausted and ctx.get() then no
   // longer advances. That 0 is not expected to count as whitespace, so
   // without the more() check a pattern running up to the end of the file
   // would append zeros forever.
   while (  ctx.more()
         && !unc_isspace(ctx.peek()))
   {
      // end the pattern on an escaped newline
      if (ctx.peek() == '\\') // 92
      {
         size_t ch = ctx.peek(1);

         if (  (ch == '\n')
            || (ch == '\r'))
         {
            break;
         }
      }
      pc.Str().append(ctx.get());
   }
}
/**
 * Consume as many consecutive (possibly blank-padded) line endings as
 * possible and report them as one CT_NEWLINE chunk.
 *
 * @param ctx  the token context to read from
 * @param pc   the chunk to fill; only touched if a newline was consumed
 *
 * @return true if at least one line ending was consumed
 */
static bool parse_off_newlines(TokenContext &ctx, Chunk &pc)
{
   size_t count = 0;

   // Parse off newlines/blank lines
   for ( ; parse_newline(ctx); ++count)
   {
   }

   if (count == 0)
   {
      return(false);
   }
   pc.SetNlCount(count);
   pc.SetType(CT_NEWLINE);
   return(true);
}
/**
 * Try to parse one piece of a multi-line macro block.
 *
 * Newlines and comments are handed out as regular chunks. Otherwise the
 * text up to a backslash-newline (or, when the previous chunk was a
 * CT_NL_CONT or CT_COMMENT_MULTI, up to a plain newline) is returned as a
 * single CT_PP_IGNORE chunk.
 *
 * @param ctx      the token context; restored if no CT_PP_IGNORE chunk results
 * @param pc       the chunk to fill
 * @param prev_pc  the previously created chunk (may be the null chunk)
 *
 * @return true if a chunk was produced
 */
static bool parse_macro(TokenContext &ctx, Chunk &pc, const Chunk *prev_pc)
{
   // newlines first, then comments (allow CT_COMMENT_MULTI within macros)
   if (  parse_off_newlines(ctx, pc)
      || parse_comment(ctx, pc))
   {
      return(true);
   }
   ctx.save();
   pc.Str().clear();

   if (prev_pc->IsNullChunk())
   {
      return(false);
   }
   const bool continued = (  prev_pc->Is(CT_NL_CONT)
                          || prev_pc->Is(CT_COMMENT_MULTI));

   while (ctx.more())
   {
      const size_t cur  = ctx.peek();
      const size_t next = ctx.peek(1);

      const bool   at_newline = (  cur == '\n'
                                || cur == '\r');
      const bool   at_nl_cont = (  cur == '\\'   // 92
                                && (  next == '\n'
                                   || next == '\r'));
      const bool   line_ends = (  at_nl_cont
                               || (  continued
                                  && at_newline));

      if (  line_ends
         && pc.GetStr().size() > 0)
      {
         pc.SetType(CT_PP_IGNORE);
         return(true);
      }

      if (at_newline)
      {
         break;
      }
      pc.Str().append(ctx.get());
   }
   // not part of a continued macro: undo everything
   pc.Str().clear();
   ctx.restore();
   return(false);
} // parse_macro
/**
 * Parse one chunk while processing is disabled.
 *
 * Newlines are returned as CT_NEWLINE. Otherwise the current line is
 * inspected:
 *  - if it contains '#endasm', or '#pragma' and 'endasm' each delimited by a
 *    blank or a tab, cpd.unc_off is cleared, the input is rewound and false
 *    is returned so that the caller tokenizes the line normally;
 *  - if an enable-processing text is configured and the line does not
 *    contain it, the whole line is returned as one CT_IGNORED chunk;
 *  - otherwise leading whitespace is returned as CT_IGNORED, a comment is
 *    passed through if parse_comment() cleared cpd.unc_off, and anything
 *    else up to the end of the line becomes CT_IGNORED.
 *
 * @param ctx  the token context to read from
 * @param pc   the chunk to fill
 *
 * @return true if a chunk was produced, false if the caller has to continue
 *         with the regular parsing (or nothing is left on the line)
 */
static bool parse_ignored(TokenContext &ctx, Chunk &pc)
{
   if (parse_off_newlines(ctx, pc))
   {
      return(true);
   }
   // See if the options::enable_processing_cmt() or #pragma endasm / #endasm text is on this line
   ctx.save();
   pc.Str().clear();

   while (  ctx.more()
         && (ctx.peek() != '\r')
         && (ctx.peek() != '\n'))
   {
      pc.Str().append(ctx.get());
   }

   if (pc.GetStr().size() == 0)
   {
      // end of file?
      return(false);
   }

   // HACK: turn on if we find '#endasm' or '#pragma' and 'endasm' separated by blanks.
   // The separating blank may be a space or a tab. The tab is spelled as an
   // escape sequence so that both alternatives stay distinguishable.
   if (  (  (  (pc.GetStr().find("#pragma ") >= 0)
            || (pc.GetStr().find("#pragma\t") >= 0))
         && (  (pc.GetStr().find(" endasm") >= 0)
            || (pc.GetStr().find("\tendasm") >= 0)))
      || (pc.GetStr().find("#endasm") >= 0))
   {
      cpd.unc_off = false;
      ctx.restore();
      pc.Str().clear();
      return(false);
   }
   // Note that we aren't actually making sure this is in a comment, yet
   log_rule_B("enable_processing_cmt");
   const auto &ontext = options::enable_processing_cmt();

   if (!ontext.empty())
   {
      bool found_enable_pattern = false;

      if (  ontext != UNCRUSTIFY_ON_TEXT
         && options::processing_cmt_as_regex())
      {
         // a user supplied marker is treated as a regular expression
         std::wstring pc_wstring(pc.GetStr().get().cbegin(),
                                 pc.GetStr().get().cend());
         std::wregex  criteria(std::wstring(ontext.cbegin(),
                                            ontext.cend()));

         found_enable_pattern = std::regex_search(pc_wstring.cbegin(),
                                                  pc_wstring.cend(),
                                                  criteria);
      }
      else
      {
         found_enable_pattern = (pc.GetStr().find(ontext.c_str()) >= 0);
      }

      if (!found_enable_pattern)
      {
         // the line collected above is ignored as a whole
         pc.SetType(CT_IGNORED);
         return(true);
      }
   }
   // the enable marker is on this line: rewind and split the line up
   ctx.restore();

   // parse off whitespace leading to the comment
   if (parse_whitespace(ctx, pc))
   {
      pc.SetType(CT_IGNORED);
      return(true);
   }

   // Look for the ending comment and let it pass
   if (  parse_comment(ctx, pc)
      && !cpd.unc_off)
   {
      return(true);
   }
   // Reset the chunk & scan to until a newline
   pc.Str().clear();

   while (  ctx.more()
         && (ctx.peek() != '\r')
         && (ctx.peek() != '\n'))
   {
      pc.Str().append(ctx.get());
   }

   if (pc.GetStr().size() > 0)
   {
      pc.SetType(CT_IGNORED);
      return(true);
   }
   return(false);
} // parse_ignored
/**
 * Parse the next chunk from the input.
 *
 * The position and flags of pc are initialised first, then the individual
 * parsers are tried one after the other; the first one that recognises the
 * text at the current position fills in pc and ends the search.
 * If nothing matches, the offending character is logged and the program
 * exits with EX_SOFTWARE.
 *
 * @param ctx      the token context to read from
 * @param pc       the chunk to fill in
 * @param prev_pc  the previously created chunk (may be the null chunk)
 *
 * @return true if a chunk was parsed, false if there is no more input
 */
static bool parse_next(TokenContext &ctx, Chunk &pc, const Chunk *prev_pc)
{
if (!ctx.more())
{
return(false);
}
// Save off the current column
pc.SetType(CT_NONE);
pc.SetOrigLine(ctx.c.row);
pc.SetColumn(ctx.c.col);
pc.SetOrigCol(ctx.c.col);
pc.SetNlCount(0);
pc.SetFlags(PCF_NONE);
// If it is turned off, we put everything except newlines into CT_IGNORED
// (parse_ignored() may also let a comment pass through or re-enable processing)
if (cpd.unc_off)
{
if (parse_ignored(ctx, pc))
{
return(true);
}
}
log_rule_B("disable_processing_nl_cont");
// Parse macro blocks
if (options::disable_processing_nl_cont())
{
if (parse_macro(ctx, pc, prev_pc))
{
return(true);
}
}
// Parse whitespace
if (parse_whitespace(ctx, pc))
{
return(true);
}
// Handle unknown/unhandled preprocessors
if ( cpd.in_preproc > CT_PP_BODYCHUNK
&& cpd.in_preproc <= CT_PP_OTHER)
{
pc.Str().clear();
// ss always holds the position just before the most recently appended character
TokenInfo ss;
ctx.save(ss);
// Chunk to a newline or comment
pc.SetType(CT_PREPROC_BODY);
size_t last = 0;
while (ctx.more())
{
size_t ch = ctx.peek();
// Fix for issue #1752
// Ignoring extra spaces after ' \ ' for preproc body continuations
if ( last == '\\' // 92
&& ch == ' ') // 32
{
ctx.get();
continue;
}
if ( (ch == '\n')
|| (ch == '\r'))
{
// Back off if this is an escaped newline
if (last == '\\') // 92
{
// rewind to just before the backslash and drop it from the text
ctx.restore(ss);
pc.Str().pop_back();
}
break;
}
// Quit on a C or C++ comment start Issue #1966
if ( (ch == '/') // 47
&& ( (ctx.peek(1) == '/') // 47
|| (ctx.peek(1) == '*'))) // 42
{
break;
}
last = ch;
ctx.save(ss);
pc.Str().append(ctx.get());
}
// an empty body falls through to the parsers below
if (pc.GetStr().size() > 0)
{
return(true);
}
}
// Detect backslash-newline
if ( (ctx.peek() == '\\') // 92
&& parse_bs_newline(ctx, pc))
{
return(true);
}
// Parse comments
if (parse_comment(ctx, pc))
{
return(true);
}
// Parse code placeholders
if (parse_code_placeholder(ctx, pc))
{
return(true);
}
if (language_is_set(LANG_CS))
{
if (parse_cs_string(ctx, pc))
{
return(true);
}
}
if (language_is_set(LANG_CS | LANG_VALA))
{
// check for non-keyword identifiers such as @if @switch, etc
// Vala also allows numeric identifiers if prefixed with '@'
if ( ctx.peek() == '@' // 64
&& ( CharTable::IsKw1(ctx.peek(1))
|| ( language_is_set(LANG_VALA)
&& CharTable::IsKw2(ctx.peek(1)))))
{
parse_word(ctx, pc, true);
return(true);
}
}
// handle VALA """ strings """
if ( language_is_set(LANG_VALA)
&& (ctx.peek() == '"') // 34
&& (ctx.peek(1) == '"') // 34
&& (ctx.peek(2) == '"')) // 34
{
parse_verbatim_string(ctx, pc);
return(true);
}
/*
* handle C++(11) string/char literal prefixes u8|u|U|L|R including all
* possible combinations and optional R delimiters: R"delim(x)delim"
*/
auto ch = ctx.peek();
if ( language_is_set(LANG_C | LANG_CPP)
&& ( ch == 'u' // 117
|| ch == 'U' // 85
|| ch == 'R' // 82
|| ch == 'L')) // 76
{
// idx counts the prefix characters in front of the quote
auto idx = size_t{};
// is_real is set when an 'R' (raw string) prefix character was seen
auto is_real = false;
if ( ch == 'u' // 117
&& ctx.peek(1) == '8') // 56
{
idx = 2;
}
else if ( unc_tolower(ch) == 'u' // 117
|| ch == 'L') // 76
{
idx++;
}
// the language test repeats the one of the enclosing if
if ( language_is_set(LANG_C | LANG_CPP)
&& ctx.peek(idx) == 'R') // 82
{
idx++;
is_real = true;
}
const auto quote = ctx.peek(idx);
if (is_real)
{
if ( quote == '"' // 34
&& parse_cr_string(ctx, pc, idx))
{
return(true);
}
}
else if ( ( quote == '"' // 34
|| quote == '\'') // 39
&& parse_string(ctx, pc, idx, true))
{
return(true);
}
}
// PAWN specific stuff
if (language_is_set(LANG_PAWN))
{
// the first item after '#define' / '#emit' is taken as a whole as CT_MACRO
if ( cpd.preproc_ncnl_count == 1
&& ( cpd.in_preproc == CT_PP_DEFINE
|| cpd.in_preproc == CT_PP_EMIT))
{
parse_pawn_pattern(ctx, pc, CT_MACRO);
return(true);
}
// Check for PAWN strings: \"hi" or !"hi" or !\"hi" or \!"hi"
if ( (ctx.peek() == '\\') // 92
|| (ctx.peek() == '!')) // 33
{
if (ctx.peek(1) == '"') // 34
{
parse_string(ctx, pc, 1, (ctx.peek() == '!')); // 33
return(true);
}
if ( ( (ctx.peek(1) == '\\') // 92
|| (ctx.peek(1) == '!')) // 33
&& (ctx.peek(2) == '"')) // 34
{
parse_string(ctx, pc, 2, false);
return(true);
}
}
// handle PAWN preprocessor args %0 .. %9
if ( cpd.in_preproc == CT_PP_DEFINE
&& (ctx.peek() == '%') // 37
&& unc_isdigit(ctx.peek(1)))
{
pc.Str().clear();
pc.Str().append(ctx.get());
pc.Str().append(ctx.get());
pc.SetType(CT_WORD);
return(true);
}
}
// Parse numbers
if (parse_number(ctx, pc))
{
return(true);
}
// Parse strings and character constants
if (language_is_set(LANG_D))
{
// D specific stuff
if (d_parse_string(ctx, pc))
{
return(true);
}
}
else
{
// Not D stuff
// Check for L'a', L"abc", S'a', S"abc", 'a', "abc", <abc> strings
ch = ctx.peek();
size_t ch1 = ctx.peek(1);
if ( ( ( (ch == 'L') // 76
|| (ch == 'S')) // 83
&& ( (ch1 == '"') // 34
|| (ch1 == '\''))) // 39
|| (ch == '"') // 34
|| (ch == '\'') // 39
|| ( (ch == '<') // 60
&& cpd.in_preproc == CT_PP_INCLUDE))
{
// skip one prefix character if the literal starts with a letter
parse_string(ctx, pc, unc_isalpha(ch) ? 1 : 0, true);
if (cpd.in_preproc == CT_PP_INCLUDE)
{
pc.SetParentType(CT_PP_INCLUDE);
}
return(true);
}
if ( (ch == '<') // 60
&& cpd.in_preproc == CT_PP_DEFINE)
{
if (Chunk::GetTail()->Is(CT_MACRO))
{
// We have "#define XXX <", assume '<' starts an include string
parse_string(ctx, pc, 0, false);
return(true);
}
}
/* Inside clang's __has_include() could be "path/to/file.h" or system-style <path/to/file.h> */
Chunk *tail = Chunk::GetTail();
if ( (ch == '(') // 40
&& (tail->IsNotNullChunk())
&& ( tail->Is(CT_CNG_HASINC)
|| tail->Is(CT_CNG_HASINCN)))
{
parse_string(ctx, pc, 0, false);
return(true);
}
}
// Check for Vala string templates
if ( language_is_set(LANG_VALA)
&& (ctx.peek() == '@')) // 64
{
size_t nc = ctx.peek(1);
if (nc == '"') // 34
{
// literal string
parse_string(ctx, pc, 1, true);
return(true);
}
}
// Check for Objective C literals
if ( language_is_set(LANG_OC)
&& (ctx.peek() == '@')) // 64
{
size_t nc = ctx.peek(1);
if (nc == 'R') // Issue #2720 82
{
if (ctx.peek(2) == '"') // 34
{
if (parse_cr_string(ctx, pc, 2)) // Issue #3027
{
return(true);
}
// parse string without escaping
parse_string(ctx, pc, 2, false);
return(true);
}
}
if ( (nc == '"') // 34
|| (nc == '\'')) // 39
{
// literal string
parse_string(ctx, pc, 1, true);
return(true);
}
if ( (nc >= '0')
&& (nc <= '9'))
{
// literal number
pc.Str().append(ctx.get()); // store the '@'
parse_number(ctx, pc);
return(true);
}
}
// Check for pawn/ObjectiveC/Java and normal identifiers
if ( CharTable::IsKw1(ctx.peek())
|| ( (ctx.peek() == '\\') // 92
&& (unc_tolower(ctx.peek(1)) == 'u')) // 117
|| ( (ctx.peek() == '@') // 64
&& CharTable::IsKw1(ctx.peek(1))))
{
parse_word(ctx, pc, false);
return(true);
}
// Check for C++11/14/17/20 attribute specifier sequences
if ( language_is_set(LANG_CPP)
&& ctx.peek() == '[') // 91
{
// with Objective-C enabled, skip the check when there is no previous chunk
// or the previous chunk is a CT_OC_AT
if ( !language_is_set(LANG_OC)
|| ( prev_pc->IsNotNullChunk()
&& !prev_pc->Is(CT_OC_AT)))
{
if (auto length = parse_attribute_specifier_sequence(ctx))
{
extract_attribute_specifier_sequence(ctx, pc, length);
return(true);
}
}
}
// see if we have a punctuator
// up to six characters of look-ahead are handed to find_punctuator()
char punc_txt[7];
punc_txt[0] = ctx.peek();
punc_txt[1] = ctx.peek(1);
punc_txt[2] = ctx.peek(2);
punc_txt[3] = ctx.peek(3);
punc_txt[4] = ctx.peek(4);
punc_txt[5] = ctx.peek(5);
punc_txt[6] = '\0';
const chunk_tag_t *punc;
if ((punc = find_punctuator(punc_txt, cpd.lang_flags)) != nullptr)
{
// consume exactly as many characters as the matched punctuator is long
int cnt = strlen(punc->tag);
while (cnt--)
{
pc.Str().append(ctx.get());
}
pc.SetType(punc->type);
pc.SetFlagBits(PCF_PUNCTUATOR);
return(true);
}
/* When parsing C/C++ files and running into some unknown token,
* check if matches Objective-C as a last resort, before
* considering it as garbage.
*/
int probe_lang_flags = 0;
if (language_is_set(LANG_C | LANG_CPP))
{
probe_lang_flags = cpd.lang_flags | LANG_OC;
}
if (probe_lang_flags != 0)
{
if ((punc = find_punctuator(punc_txt, probe_lang_flags)) != nullptr)
{
// from now on the Objective-C flag stays set in cpd.lang_flags
cpd.lang_flags = probe_lang_flags;
int cnt = strlen(punc->tag);
while (cnt--)
{
pc.Str().append(ctx.get());
}
pc.SetType(punc->type);
pc.SetFlagBits(PCF_PUNCTUATOR);
return(true);
}
}
// throw away this character
pc.SetType(CT_UNKNOWN);
pc.Str().append(ctx.get());
// the column is read after the character was consumed
LOG_FMT(LWARN, "%s:%zu Garbage in col %zu: %x\n",
cpd.filename.c_str(), pc.GetOrigLine(), ctx.c.col, pc.GetStr()[0]);
// unrecognised input is fatal: the program ends here
exit(EX_SOFTWARE);
} // parse_next
/**
 * Search 'text', starting at 'start_idx', for the text that disables
 * processing (options::disable_processing_cmt()).
 *
 * A marker that differs from UNCRUSTIFY_OFF_TEXT is treated as a regular
 * expression if options::processing_cmt_as_regex() is set; otherwise a plain
 * text search is done.
 *
 * @param text       the text to search in
 * @param start_idx  the index at which the search starts
 *
 * @return the index of the start of the line on which the marker was found,
 *         or -1 if the marker is empty, start_idx is out of range or the
 *         marker was not found
 */
int find_disable_processing_comment_marker(const UncText &text,
                                           std::size_t   start_idx)
{
   log_rule_B("disable_processing_cmt");
   const auto &marker = options::disable_processing_cmt();

   if (  marker.empty()
      || start_idx >= text.size())
   {
      return(-1);
   }
   int pos = -1;

   if (  marker != UNCRUSTIFY_OFF_TEXT
      && options::processing_cmt_as_regex())
   {
      std::wstring haystack(text.get().cbegin() + start_idx,
                            text.get().cend());
      std::wregex  pattern(std::wstring(marker.cbegin(),
                                        marker.cend()));
      std::wsmatch hit;

      if (std::regex_search(haystack.cbegin(), haystack.cend(), hit, pattern))
      {
         pos = int(hit.position() + start_idx);
      }
   }
   else
   {
      pos = text.find(marker.c_str(), start_idx);

      if (pos >= 0)
      {
         pos += int(marker.size());
      }
   }

   // update the position to the start of the current line
   for ( ; pos > 0 && text[pos - 1] != '\n'; --pos)
   {
   }

   return(pos);
} // find_disable_processing_comment_marker
/**
 * Search 'text', starting at 'start_idx', for the text that re-enables
 * processing (options::enable_processing_cmt()).
 *
 * A marker that differs from UNCRUSTIFY_ON_TEXT is treated as a regular
 * expression if options::processing_cmt_as_regex() is set; otherwise a plain
 * text search is done.
 *
 * @param text       the text to search in
 * @param start_idx  the index at which the search starts
 *
 * @return the index of the end of the line on which the marker ends (the
 *         index of the next '\n', or text.size() if there is none), or -1
 *         if the marker is empty, start_idx is out of range or the marker
 *         was not found
 */
int find_enable_processing_comment_marker(const UncText &text,
                                          std::size_t   start_idx)
{
   log_rule_B("enable_processing_cmt");
   const auto &ontext = options::enable_processing_cmt();
   int        idx     = -1;

   if (  !ontext.empty()
      && start_idx < text.size())
   {
      if (  ontext != UNCRUSTIFY_ON_TEXT
         && options::processing_cmt_as_regex())
      {
         std::wsmatch match;
         std::wstring pc_wstring(text.get().cbegin() + start_idx,
                                 text.get().cend());
         std::wregex  criteria(std::wstring(ontext.cbegin(),
                                            ontext.cend()));

         std::regex_search(pc_wstring.cbegin(),
                           pc_wstring.cend(),
                           match,
                           criteria);

         if (!match.empty())
         {
            // Step past the matched text. match.length() is the number of
            // characters matched; match.size() would be the number of
            // sub-matches, which is unrelated to the position in the text.
            idx = int(start_idx + match.position() + match.length());
         }
      }
      else
      {
         idx = text.find(ontext.c_str(),
                         start_idx);

         if (idx >= 0)
         {
            idx += int(ontext.size());
         }
      }

      /**
       * update the position to the end of the current line
       */
      if (idx >= 0)
      {
         while (  idx < int(text.size())
               && text[idx] != '\n')
         {
            ++idx;
         }
      }
   }
   return(idx);
} // find_enable_processing_comment_marker
/**
 * Break the input into chunks and add each of them before 'ref'.
 *
 * For every chunk returned by parse_next() this function
 *  - drops pure whitespace, remembering its width for the next chunk,
 *  - strips trailing blanks/tabs from the chunk text (except for CT_IGNORED),
 *  - records the original end column,
 *  - tracks the preprocessor state in cpd.in_preproc and
 *    cpd.preproc_ncnl_count,
 *  - turns processing off (cpd.unc_off) on '#asm' / '#pragma asm'.
 * At the end the line ending to use is stored in cpd.newline.
 *
 * The program exits with EX_SOFTWARE if parse_next() fails before the end
 * of the input or if a multi-line string is used as an include file name.
 *
 * @param data  the characters of the input
 * @param ref   the chunk before which the new chunks are added; if it is not
 *              the null chunk the new chunks are flagged PCF_INSERTED
 */
void tokenize(const deque<int> &data, Chunk *ref)
{
   TokenContext ctx(data);
   Chunk        chunk;
   Chunk        *pc          = Chunk::NullChunkPtr;
   Chunk        *rprev       = Chunk::NullChunkPtr;
   bool         last_was_tab = false;
   size_t       prev_sp      = 0;
   int          num_stripped = 0;                // Issue #1966

   cpd.unc_stage = unc_stage_e::TOKENIZE;

   while (ctx.more())
   {
      chunk.Reset();
      chunk.SetPpLevel(0);

      if (!parse_next(ctx, chunk, pc))
      {
         LOG_FMT(LERR, "%s:%zu Bailed before the end?\n",
                 cpd.filename.c_str(), ctx.c.row);
         exit(EX_SOFTWARE);
      }

      // strncmp stops at the terminating NUL, so it never reads past the end
      // of a text that is shorter than the compared length
      if (  language_is_set(LANG_JAVA)
         && chunk.GetType() == CT_MEMBER
         && strncmp(chunk.Text(), "->", 2) == 0)
      {
         chunk.SetType(CT_LAMBDA);
      }

      // Don't create an entry for whitespace
      if (chunk.GetType() == CT_WHITESPACE)
      {
         last_was_tab = chunk.GetAfterTab();
         prev_sp      = chunk.GetOrigPrevSp();
         continue;
      }
      chunk.SetOrigPrevSp(prev_sp);
      prev_sp = 0;

      if (chunk.GetType() == CT_NEWLINE)
      {
         last_was_tab = chunk.GetAfterTab();
         chunk.SetAfterTab(false);
         chunk.Str().clear();
      }
      else if (chunk.GetType() == CT_NL_CONT)
      {
         last_was_tab = chunk.GetAfterTab();
         chunk.SetAfterTab(false);
         chunk.Str() = "\\\n";
      }
      else
      {
         chunk.SetAfterTab(last_was_tab);
         last_was_tab = false;
      }
      num_stripped = 0;                          // Issue #1966 and #3565

      if (chunk.GetType() != CT_IGNORED)
      {
         // Issue #1338
         // Strip trailing whitespace (for CPP comments and PP blocks)
         while (  (chunk.GetStr().size() > 0)
               && (  (chunk.GetStr()[chunk.GetStr().size() - 1] == ' ')  // 32
                  || (chunk.GetStr()[chunk.GetStr().size() - 1] == '\t')))
         {
            // If comment contains backslash '\' followed by whitespace chars, keep last one;
            // this will prevent it from turning '\' into line continuation.
            if (  (chunk.GetStr().size() > 1)
               && (chunk.GetStr()[chunk.GetStr().size() - 2] == '\\'))
            {
               break;
            }
            chunk.Str().pop_back();
            num_stripped++;                      // Issue #1966
         }
      }
      // Store off the end column
      chunk.SetOrigColEnd(ctx.c.col - num_stripped); // Issue #1966 and #3565
      // Make the whitespace we disposed of be attributed to the next chunk
      prev_sp = num_stripped;

      // Add the chunk to the list
      rprev = pc;

      if (rprev->IsNotNullChunk())
      {
         // NOTE(review): pc and rprev refer to the same chunk at this point,
         // so this sets flags on the previous chunk that it already has.
         // Presumably the new chunk was meant to inherit them - confirm
         // before changing, later passes may rely on the current behavior.
         pc->SetFlagBits(rprev->GetFlags() & PCF_COPY_FLAGS);

         // a newline can't be in a preprocessor
         if (pc->Is(CT_NEWLINE))
         {
            pc->ResetFlagBits(PCF_IN_PREPROC);
         }
      }

      if (ref->IsNotNullChunk())
      {
         chunk.SetFlagBits(PCF_INSERTED);
      }
      else
      {
         chunk.ResetFlagBits(PCF_INSERTED);
      }
      // from here on pc is the copy of chunk that was added before ref
      pc = chunk.CopyAndAddBefore(ref);

      // A newline marks the end of a preprocessor
      if (pc->Is(CT_NEWLINE)) // || pc->Is(CT_COMMENT_MULTI))
      {
         cpd.in_preproc         = CT_NONE;
         cpd.preproc_ncnl_count = 0;
      }

      // Disable indentation when #asm directive found
      if (pc->Is(CT_PP_ASM))
      {
         LOG_FMT(LBCTRL, "Found a directive %s on line %zu\n", "#asm", pc->GetOrigLine());
         cpd.unc_off = true;
      }

      // Special handling for preprocessor stuff
      if (cpd.in_preproc != CT_NONE)
      {
         pc->SetFlagBits(PCF_IN_PREPROC);
         // Issue #2225
         LOG_FMT(LBCTRL, "%s(%d): orig line is %zu, orig col is %zu, type is %s, parentType is %s\n",
                 __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(),
                 get_token_name(pc->GetType()), get_token_name(pc->GetParentType()));

         if (  pc->Is(CT_STRING_MULTI)
            && pc->GetParentType() == CT_PP_INCLUDE)
         {
            LOG_FMT(LWARN, "%s:%zu: File name is not possible %s\n",
                    cpd.filename.c_str(), pc->GetOrigLine(), pc->Text());
            exit(EX_SOFTWARE);
         }

         // Count words after the preprocessor
         if (!pc->IsCommentOrNewline())
         {
            cpd.preproc_ncnl_count++;
         }

         // Disable indentation if a #pragma asm directive is found
         if (cpd.in_preproc == CT_PP_PRAGMA)
         {
            // prefix test; strncmp (unlike memcmp) is safe for texts shorter
            // than three characters
            if (strncmp(pc->Text(), "asm", 3) == 0)
            {
               LOG_FMT(LBCTRL, "Found a pragma %s on line %zu\n", "asm", pc->GetOrigLine());
               cpd.unc_off = true;
            }
         }

         // Figure out the type of preprocessor for #include parsing
         if (cpd.in_preproc == CT_PREPROC)
         {
            // the chunk right after the '#' names the directive; anything
            // outside the known range is treated as CT_PP_OTHER
            if (  pc->GetType() < CT_PP_DEFINE
               || pc->GetType() > CT_PP_OTHER)
            {
               pc->SetType(CT_PP_OTHER);
            }
            cpd.in_preproc = pc->GetType();
         }
         else if (cpd.in_preproc == CT_PP_IGNORE)
         {
            if (  !pc->Is(CT_NL_CONT)
               && !pc->IsComment())              // Issue #1966
            {
               pc->SetType(CT_PP_IGNORE);
            }
         }
         else if (  cpd.in_preproc == CT_PP_DEFINE
                 && pc->Is(CT_PAREN_CLOSE)
                 && options::pp_ignore_define_body())
         {
            log_rule_B("pp_ignore_define_body");
            // When we have a PAREN_CLOSE in a PP_DEFINE we should be terminating a MACRO_FUNC
            // arguments list. Therefore we can enter the PP_IGNORE state and ignore next chunks.
            cpd.in_preproc = CT_PP_IGNORE;
         }
      }
      else
      {
         // Check for a preprocessor start
         if (  pc->Is(CT_POUND)
            && (  rprev->IsNullChunk()
               || rprev->Is(CT_NEWLINE)))
         {
            pc->SetType(CT_PREPROC);
            pc->SetFlagBits(PCF_IN_PREPROC);
            cpd.in_preproc = CT_PREPROC;
         }
      }

      if (pc->Is(CT_NEWLINE))
      {
         LOG_FMT(LBCTRL, "%s(%d): orig line is %zu, orig col is %zu, <Newline>, nl is %zu\n",
                 __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(), pc->GetNlCount());
      }
      else if (pc->Is(CT_VBRACE_OPEN))
      {
         LOG_FMT(LBCTRL, "%s(%d): orig line is %zu, orig col is %zu, type is %s, orig col end is %zu\n",
                 __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(), get_token_name(pc->GetType()), pc->GetOrigColEnd());
      }
      else
      {
         char copy[1000];
         LOG_FMT(LBCTRL, "%s(%d): orig line is %zu, orig col is %zu, Text() '%s', type is %s, orig col end is %zu\n",
                 __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(), pc->ElidedText(copy), get_token_name(pc->GetType()), pc->GetOrigColEnd());
      }
   }
   // Set the cpd.newline string for this file
   // With LE_AUTO the most frequent line ending wins; LF is preferred over
   // CRLF, and CRLF over CR, when the counts are equal.
   log_rule_B("newlines");

   if (  options::newlines() == LE_LF
      || (  options::newlines() == LE_AUTO
         && (LE_COUNT(LF) >= LE_COUNT(CRLF))
         && (LE_COUNT(LF) >= LE_COUNT(CR))))
   {
      // LF line ends
      cpd.newline = "\n";
      LOG_FMT(LLINEENDS, "Using LF line endings\n");
   }
   else if (  options::newlines() == LE_CRLF
           || (  options::newlines() == LE_AUTO
              && (LE_COUNT(CRLF) >= LE_COUNT(LF))
              && (LE_COUNT(CRLF) >= LE_COUNT(CR))))
   {
      // CRLF line ends
      cpd.newline = "\r\n";
      LOG_FMT(LLINEENDS, "Using CRLF line endings\r\n");
   }
   else
   {
      // CR line ends
      cpd.newline = "\r";
      LOG_FMT(LLINEENDS, "Using CR line endings\n");
   }
} // tokenize