You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdewebdev/klinkstatus/src/parser/htmlparser.h

125 lines
3.6 KiB

/***************************************************************************
* Copyright (C) 2004 by Paulo Moura Guedes *
* moura@tdewebdev.org *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#ifndef HTML_PARSER_H
#define HTML_PARSER_H
#include <tqstring.h>
#include <vector>
#include "mstring.h"
#include "node.h"
#include <iostream>
using namespace std;
typedef unsigned int uint;
class HtmlParser
{
public:
HtmlParser();
HtmlParser(TQString const& documento);
~HtmlParser();
vector<Node*> const& nodes() const;
bool hasBaseUrl() const;
bool hasTitle() const;
bool hasContentType() const;
NodeBASE const& baseUrl() const;
NodeTITLE const& title() const;
NodeMETA const& contentTypeMetaNode() const;
static uint estimativaLinks(uint doc_size);
/**
* Convenience function for performance as it only parse in order
* to get the charset.
*/
static TQString findCharsetInMetaElement(TQString const& html);
// test:
void mostra() const;
private:
vector<TQString> const& parseNodesOfType(TQString const& element);
/**
* Vector nodes passed for performance.
*/
static void parseNodesOfType(TQString const& element, TQString const& doc, vector<TQString>& nodes);
void parseNodesOfTypeA();
void parseNodesOfTypeAREA();
void parseNodesOfTypeLINK();
void parseNodesOfTypeMETA();
void parseNodesOfTypeIMG();
void parseNodesOfTypeFRAME();
void parseNodesOfTypeIFRAME();
void parseNodesOfTypeBASE();
void parseNodesOfTypeTITLE();
void stripComments();
void stripScriptContent();
/**
Return the index of the next character of the end of tag.
e.g.
endOfTag("<img src=\"bad > luck\">") => 22 (not 15)
*/
static int endOfTag(TQString const& s, int index = 0, TQChar end_of_tag = '>');
private:
vector<TQString> aux_; // for what the hell is this? looks ugly... maybe I was drunk, can't remember
vector<Node*> nodes_;
NodeBASE node_BASE_;
NodeTITLE node_TITLE_;
NodeMETA node_META_content_type_;
bool is_content_type_set_;
TQString document_;
TQString script_; // Fica aqui guardado (JavaScript, etc)
TQString comments_;
};
inline HtmlParser::~HtmlParser()
{
//kdDebug(23100) << "*";
}
inline uint HtmlParser::estimativaLinks(uint doc_size)
{
return doc_size / 100; // valor estimado...
}
inline bool HtmlParser::hasContentType() const
{
return is_content_type_set_;
}
#endif