/*

  This file is part of KIllustrator.
  Copyright (C) 1998 Kai-Uwe Sattler (kus@iti.cs.uni-magdeburg.de)

  modified for kvoctrain by Ewald Arnold kvoctrain@ewald-arnold.de
  in April ´99

  -----------------------------------------------------------------------

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU Library General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU Library General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

*/

#include "XmlTokenizer.h"
#include <ctype.h>

/**
 * Creates a tokenizer that reads from the stream @p is.
 *
 * The tokenizer starts outside of any tag, with an empty push-back buffer
 * and the line counter set to 1.
 */
XmlTokenizer::XmlTokenizer (KOXML_ISTREAM& is) :
  last_chars (""), istrm (is), use_last (false), is_open (false), lineno(1)
{
}

XmlTokenizer::~XmlTokenizer ()
{
}

/*
bool XmlTokenizer::hasMoreTokens () {
  return ! istrm.eof ();
}
*/

/**
 * Consumes characters while they are whitespace.
 *
 * The first non-whitespace character is pushed back so that the next
 * readchar() delivers it again.  Every newline that is read increments
 * the line counter.  The loop also ends when the stream reports EOF.
 */
void XmlTokenizer::skipWhitespace ()
{
  KOXML_CHAR c;
  do {
    c = readchar ();
    if (c == '\n')
      lineno++;

    if (! isspace (c)) {
      putback (c);
      return;
    }
    else if (istrm.eof ())
      return;
  } while (1);
}

/**
 * Appends @p c to the push-back buffer.
 *
 * readchar() serves buffered characters (front first) before it touches
 * the stream again.
 */
void XmlTokenizer::putback (KOXML_CHAR c)
{
  last_chars += c;
}

/**
 * Returns the next character, taken from the push-back buffer when it is
 * not empty and from the input stream otherwise.
 *
 * When the stream read fails (e.g. at EOF) a NUL character is returned
 * instead of an indeterminate value; callers detect the end of input via
 * istrm.eof ().
 */
KOXML_CHAR XmlTokenizer::readchar ()
{
  // Initialised so that a failed stream read never hands an
  // uninitialised character back to the caller.
  KOXML_CHAR c = '\0';

  if (last_chars.length() > 0) {
    c = last_chars[0];
    KOXML_STRING_REMOVE (last_chars, 0, 1);
  }
  else {
# ifndef KOXML_USE_STL
    istrm >> c;
# else
    istrm.get(c);
# endif
  }
  return c;
}

/**
 * Makes the next call of nextToken() return the most recently returned
 * token once more.
 */
void XmlTokenizer::unget ()
{
  use_last = true;
}

/**
 * Reads and returns the next token from the input.
 *
 * Outside of a tag everything up to the next '<' is delivered as text
 * (see readText()).  Inside of a tag single punctuation characters map to
 * their own token, '"' starts a string, letters and digits start a symbol
 * and "!-" starts a comment.  The returned token is remembered in
 * last_tok so that unget() can replay it.
 *
 * @return the token that was read, Tok_EOF at the end of the input, or
 *         Tok_Invalid for a character that is not allowed inside a tag.
 */
XmlTokenizer::Token XmlTokenizer::nextToken ()
{
  if (use_last) {
    // replay the token that was handed back by unget()
    use_last = false;
    return last_tok;
  }

  skipWhitespace ();
  if (istrm.eof ())
    return last_tok = Tok_EOF;

  KOXML_CHAR c = readchar ();
  if (c == '\n')
    lineno++;

  if (!is_open) {
    if (c != '<') {
      // outside of a tag: everything up to the next '<' is text
      putback (c);
      return last_tok = readText ();
    }
  }

  switch (c) {
  case '<':
    is_open = true;
    return last_tok = Tok_Lt;
  case '>':
    is_open = false;
    return last_tok = Tok_Gt;
  case '?':
    return last_tok = Tok_TQSign;
  case '/':
    return last_tok = Tok_Slash;
  case '=':
    return last_tok = Tok_Eq;
  case '(':
    return last_tok = Tok_LParen;
  case ')':
    return last_tok = Tok_RParen;
  case '[':
    return last_tok = Tok_LBracket;
  case ']':
    return last_tok = Tok_RBracket;
  case '|':
    return last_tok = Tok_Bar;
  case '*':
    return last_tok = Tok_Asterisk;
  case '+':
    return last_tok = Tok_Plus;
  case ',':
    return last_tok = Tok_Comma;
  case ';':
    return last_tok = Tok_Semicolon;
  case '%':
    return last_tok = Tok_Percent;
  case '#':
    return last_tok = Tok_NSign;
  case '\'':
    return last_tok = Tok_Apostr;
  case '"':
    // read a string
    return last_tok = readString ();
  default:
    // is_open is always true here: while it is false every character
    // other than '<' has already been routed to readText() above.
    if (isalpha (c) || isdigit (c)) {
      // symbol (element or attribute name)
      putback (c);
      return last_tok = readSymbol ();
    }
    if (c == '!') {
      // look one character ahead to tell "<!--" from a plain '!'
      c = readchar ();
      if (c == '\n')
        lineno++;
      putback (c);
      if (c == '-') {
        last_tok = readComment ();
        // readComment() consumes the '>' that closes the comment, so
        // the tokenizer is outside of a tag again.
        if (last_tok == Tok_Comment)
          is_open = false;
        return last_tok;
      }
      return last_tok = Tok_Exclam;
    }
    return last_tok = Tok_Invalid;
  }
}

/**
 * Returns the text collected for the most recently read symbol, string,
 * comment or text token.
 */
const KOXML_STRING& XmlTokenizer::element ()
{
  return elem;
}

/**
 * Reads a symbol (element or attribute name) into elem.
 *
 * The symbol ends at whitespace, at EOF, or at one of the delimiters
 * = / > ? | ) ' , ; -- a delimiter is pushed back because the caller
 * still needs it.  Letters, digits and '-' are accepted everywhere, '_'
 * only after the first character.
 *
 * @return Tok_Symbol, or Tok_Invalid when a character that is not
 *         allowed is encountered.
 */
XmlTokenizer::Token XmlTokenizer::readSymbol ()
{
  KOXML_CHAR c;
  elem = "";

  while (1) {
    c = readchar ();
    if (c == '\n')
      lineno++;

    if (istrm.eof () || isspace (c))
      // symbol is complete
      break;
    else if (c == '=' || c == '/' || c == '>' || c == '?' || c == '|' ||
             c == ')' || c == '\'' || c == ',' || c == ';') {
      // symbol is complete, but the character just read is still needed
      putback (c);
      break;
    }
    else if (isalnum (c) || c == '-' || (c == '_' && elem.length () > 0))
      // valid character -> append
      // elem += tolower (c);  ?????????
      elem += c;
    else {
      // character not allowed ?
      return Tok_Invalid;
    }
  }
  // convert all upper case letters to lower case !!!!
  return Tok_Symbol;
}

/**
 * Reads a double-quoted string into elem; the opening quote has already
 * been consumed by the caller.
 *
 * @return Tok_String when the closing '"' is found, Tok_Invalid when the
 *         input ends before the string is closed.
 */
XmlTokenizer::Token XmlTokenizer::readString ()
{
  KOXML_CHAR c;
  elem = "";

  while (1) {
    c = readchar ();
    if (c == '\n')
      lineno++;

    if (istrm.eof ())
      // string is not terminated yet
      return Tok_Invalid;
    else if (c == '\\') {
      // quote next character
      // NOTE(review): quoting is not implemented -- the backslash itself
      // is dropped and the following character is handled normally.
    }
    else if (c == '"') {
      // string is complete
      return Tok_String;
    }
    else
      elem += c;
  }
}

/**
 * Reads a comment into elem.
 *
 * Expects the two dashes of the "<!--" opener as the next characters and
 * reads up to and including the closing "-->".  Newlines inside of the
 * comment are counted but not stored; the dashes of the terminator remain
 * part of elem.
 *
 * @return Tok_Comment, or Tok_Invalid when the opener is malformed or
 *         the input ends inside of the comment.
 */
XmlTokenizer::Token XmlTokenizer::readComment ()
{
  elem = "";

  KOXML_CHAR c1 = readchar ();
  if (c1 == '\n')
    lineno++;
  KOXML_CHAR c2 = readchar ();
  if (c2 == '\n')
    lineno++;

  if (c1 != '-' || c2 != '-' || istrm.eof ())
    return Tok_Invalid;

  // Number of consecutive '-' characters read immediately before the
  // current one; a '>' only closes the comment when it follows "--".
  int dashes = 0;

  while (1) {
    c1 = readchar ();
    if (istrm.eof ())
      return Tok_Invalid;

    if (c1 == '\n') {
      lineno++;
      dashes = 0;
      continue;
    }

    if (c1 == '>' && dashes >= 2)
      return Tok_Comment;

    if (c1 == '-')
      dashes++;
    else
      dashes = 0;
    elem += c1;
  }
}

/**
 * Reads character data up to (but not including) the next '<' into elem.
 *
 * The entity references &lt; &gt; &amp; &lf; and &nl; are replaced by
 * '<', '>', '&', carriage return and newline respectively; any other
 * entity reference is dropped.
 *
 * @return Tok_Text when a '<' ends the text, Tok_EOF when the input ends
 *         first.
 */
XmlTokenizer::Token XmlTokenizer::readText ()
{
  KOXML_CHAR c;
  elem = "";

  while (1) {
    c = readchar ();
    if (c == '\n')
      lineno++;

    if (istrm.eof ())
      return Tok_EOF;
    else if (c == '<') {
      putback (c);
      return Tok_Text;
    }
    else if (c == '&') {
      // collect the entity name including the leading '&' but without
      // the terminating ';'
      KOXML_STRING s;
      while (c != ';') {
        s += c;
        c = readchar ();
        if (c == '\n')
          lineno++;

        if (istrm.eof ())
          return Tok_EOF;
      }
      if (s == "&lt")
        elem += "<";
      else if (s == "&gt")
        elem += ">";
      else if (s == "&amp")
        elem += "&";
      else if (s == "&lf")
        elem += "\r";
      else if (s == "&nl")
        elem += "\n";
      // entities ?
      // elem += "[" + s +";]";
    }
    else
      elem += c;
  }
}

#ifdef TEST
/**
 * Test driver: tokenizes standard input and prints one marker per token
 * until EOF.  Stops with exit code 1 on the first token it cannot print.
 */
int main (int argc, char** argv)
{
  XmlTokenizer::Token tok;
  XmlTokenizer tokenizer (cin);

  while ((tok = tokenizer.nextToken ()) != XmlTokenizer::Tok_EOF) {
    switch (tok) {
    case XmlTokenizer::Tok_Exclam:
      cout << "! ";
      break;
    case XmlTokenizer::Tok_Bar:
      cout << "| ";
      break;
    case XmlTokenizer::Tok_LParen:
      cout << "( ";
      break;
    case XmlTokenizer::Tok_RParen:
      cout << ") ";
      break;
    case XmlTokenizer::Tok_LBracket:
      cout << "[ ";
      break;
    case XmlTokenizer::Tok_RBracket:
      cout << "] ";
      break;
    case XmlTokenizer::Tok_Plus:
      cout << "+ ";
      break;
    case XmlTokenizer::Tok_Asterisk:
      cout << "* ";
      break;
    case XmlTokenizer::Tok_Comma:
      cout << ", ";
      break;
    case XmlTokenizer::Tok_Semicolon:
      cout << "; ";
      break;
    case XmlTokenizer::Tok_NSign:
      cout << "# ";
      break;
    case XmlTokenizer::Tok_Apostr:
      cout << "' ";
      break;
    case XmlTokenizer::Tok_Percent:
      cout << "% ";
      break;
    case XmlTokenizer::Tok_Lt:
      cout << "< ";
      break;
    case XmlTokenizer::Tok_Gt:
      cout << "> ";
      break;
    case XmlTokenizer::Tok_TQSign:
      cout << "? ";
      break;
    case XmlTokenizer::Tok_Slash:
      cout << "/ ";
      break;
    case XmlTokenizer::Tok_Eq:
      cout << "= ";
      break;
    case XmlTokenizer::Tok_Symbol:
      cout << "SYMBOL(" << tokenizer.element () << ") ";
      break;
    case XmlTokenizer::Tok_String:
      cout << "STRING(" << tokenizer.element () << ") ";
      break;
    case XmlTokenizer::Tok_Comment:
      cout << "COMMENT > ";
      break;
    case XmlTokenizer::Tok_Text:
      cout << "TEXT(" << tokenizer.element () << ") ";
      break;
    default:
      cout << "INVALID(" << tok << ")" << endl;
      return 1;
      break;
    }
  }
}
#endif