tdewebdev/klinkstatus/src/parser/htmlparser.cpp

/***************************************************************************
 *   Copyright (C) 2004 by Paulo Moura Guedes                              *
 *   moura@kdewebdev.org                                                        *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.             *
 ***************************************************************************/

#include "htmlparser.h"

#include <kapplication.h>
#include <kdebug.h>


HtmlParser::HtmlParser(TQString const& documento)
    : is_content_type_set_(false), document_(documento)
{
    Q_ASSERT(!documento.isEmpty());

    stripScriptContent();
    stripComments(); // after removing the script because comments in scripts have diferent sintaxe

    nodes_.reserve(estimativaLinks(documento.length() * 2)); // à confiança ;)

    parseNodesOfTypeA();
    parseNodesOfTypeAREA();
    parseNodesOfTypeLINK();
    parseNodesOfTypeMETA();
    parseNodesOfTypeIMG();
    parseNodesOfTypeFRAME();
    parseNodesOfTypeIFRAME();
    parseNodesOfTypeBASE();
    parseNodesOfTypeTITLE();
}

bool HtmlParser::hasBaseUrl() const
{
    return (node_BASE_.element() == Node::BASE &&
            !node_BASE_.url().isEmpty());
}

NodeBASE const& HtmlParser::baseUrl() const
{
    Q_ASSERT(hasBaseUrl());
    return node_BASE_;
}

NodeMETA const& HtmlParser::contentTypeMetaNode() const
{
    Q_ASSERT(hasContentType());
    return node_META_content_type_;
}

bool HtmlParser::hasTitle() const
{
    return (node_TITLE_.element() == Node::TITLE &&
            !node_TITLE_.attributeTITLE().isEmpty());
}

NodeTITLE const& HtmlParser::title() const
{
    Q_ASSERT(hasTitle());
    return node_TITLE_;
}

vector<TQString> const& HtmlParser::parseNodesOfType(TQString const& element)
{
    HtmlParser::parseNodesOfType(element, document_, aux_);
    return aux_;
}

void HtmlParser::parseNodesOfType(TQString const& tipo, TQString const& document, vector<TQString>& nodes)
{
    TQString node;
    TQString doc(document);
    int inicio = 0, fim = 0;

    nodes.clear();
    if(upperCase(tipo) == "A")
        nodes.reserve(estimativaLinks(doc.length() * 2));

    while(true)
    {
        inicio = findSeparableWord(doc, "<" + tipo);
        if(inicio == -1)
            return;

        //if( (doc[inicio] != ' ' && doc[inicio] != '\n' && doc[inicio] != '\r') )
        if(!::isSpace(doc[inicio]))
        {
            doc.remove(0, TQString("<" + tipo).length());
            continue;
        }

        if(upperCase(tipo) == "A")
            fim = findWord(doc, "</A>", inicio);
        else
        {
            //fim = findChar(doc, '>', inicio + 1);
            fim = endOfTag(doc, inicio, '>');
        }

        if(fim == -1)
        {
            doc.remove(0, 1);
            continue;
        }

        int tag_begining_go_back = (tipo.length() + TQString("<").length());
        node = doc.mid(inicio - tag_begining_go_back,
                       fim - inicio + tag_begining_go_back);
        nodes.push_back(node);
        doc.remove(0, fim);
    }
}

int HtmlParser::endOfTag(TQString const& s, int index, TQChar end_of_tag)
{
    if( (uint)index >= s.length() )
        return -1;

    int _end_of_tag = s.tqfind(end_of_tag, index);
    if(_end_of_tag == -1)
        return _end_of_tag;

    int open_aspas = s.tqfind('"', index);
    if(open_aspas == -1)
        return _end_of_tag + 1;

    else if(_end_of_tag < open_aspas)
        return _end_of_tag + 1;

    else if( ((uint)open_aspas + 1) >= s.length() - 1 )
        return -1;

    else
    {
        int close_aspas = s.tqfind('"', open_aspas + 1);
        if(close_aspas != -1)
            return endOfTag(s, close_aspas + 1, end_of_tag);
        else
        {
            kdDebug(23100) <<  "Mismatched quotes (\"): " << s.mid(index, _end_of_tag - index) << endl;
            //return -1;
            return _end_of_tag + 1;
        }
    }
}

vector<Node*> const& HtmlParser::nodes() const
{
    return nodes_;
}


void HtmlParser::parseNodesOfTypeA()
{
    vector<TQString> const& aux = parseNodesOfType("A");

    for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
    {
        nodes_.push_back( new NodeA(aux[i]) );
    }
}

void HtmlParser::parseNodesOfTypeAREA()
{
    vector<TQString> const& aux = parseNodesOfType("AREA");

    for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
    {
        nodes_.push_back( new NodeAREA(aux[i]) );
    }
}

void HtmlParser::parseNodesOfTypeLINK()
{
    vector<TQString> const& aux = parseNodesOfType("LINK");

    for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
        nodes_.push_back( new NodeLINK(aux[i]) );
}

void HtmlParser::parseNodesOfTypeMETA()
{
    vector<TQString> const& aux = parseNodesOfType("META");

    for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
    {
        NodeMETA* node = new NodeMETA(aux[i]);
        nodes_.push_back(node);

        if(!is_content_type_set_ && node->atributoHTTP_ETQUIV().lower() == TQString("Content-Type").lower()) {
            is_content_type_set_ = true;
            node_META_content_type_.setNode(aux[i]);
        }
    }
}

TQString HtmlParser::findCharsetInMetaElement(TQString const& html)
{
    vector<TQString> metaTags;
    parseNodesOfType("META", html, metaTags);

    for(vector<TQString>::size_type i = 0; i != metaTags.size(); ++i)
    {
        NodeMETA node(metaTags[i]);

        if(node.atributoHTTP_ETQUIV().lower() == TQString("Content-Type").lower()) {
            return node.charset();
        }
    }
    return TQString();
}

void HtmlParser::parseNodesOfTypeIMG()
{
    vector<TQString> const& aux = parseNodesOfType("IMG");

    for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
        nodes_.push_back( new NodeIMG(aux[i]) );
}

void HtmlParser::parseNodesOfTypeFRAME()
{
    vector<TQString> const& aux = parseNodesOfType("FRAME");

    for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
        nodes_.push_back( new NodeFRAME(aux[i]) );
}

void HtmlParser::parseNodesOfTypeIFRAME()
{
    vector<TQString> const& aux = parseNodesOfType("IFRAME");

    for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
        nodes_.push_back( new NodeFRAME(aux[i]) );
}

void HtmlParser::parseNodesOfTypeBASE()
{
    TQString node;
    TQString doc = document_;
    int inicio = 0, fim = 0;

    inicio = findSeparableWord(doc, "<BASE");
    if(inicio == -1 || !doc[inicio].isSpace())
        return;

    fim = doc.tqfind(">", inicio);
    if(fim == -1)
        return;

    node = doc.mid(inicio, fim-inicio);
    node_BASE_.setNode(node);
}

void HtmlParser::parseNodesOfTypeTITLE()
{
    TQString node;
    TQString doc = document_;
    int inicio = 0, fim = 0;

    inicio = findSeparableWord(doc, "<TITLE>");
    if(inicio == -1)
        return;

    fim = findSeparableWord(doc, "</TITLE>", inicio);
    if(fim == -1)
        return;

    node = doc.mid(inicio, fim-inicio);

    node_TITLE_.setNode(node);
}


void HtmlParser::stripComments()
{
    TQString begin_comment = "<!--";
    TQString end_comment = "-->";
    uint const begin_comment_length = begin_comment.length();

    int inicio = -1;
    do
    {
        inicio = findWord(document_, begin_comment);
        if(inicio != -1)
        {
            int fim = findWord(document_, end_comment, inicio);
            if(fim == -1)
            {
                kdDebug(23100) <<  "End of comment is missing!" << endl;
                document_.remove(inicio - begin_comment_length, begin_comment_length);
            }
            else
            {
                comments_ += "\n" + document_.mid(inicio - begin_comment_length,
                                                  fim - inicio + begin_comment_length);
                document_.remove(inicio - begin_comment_length, fim - inicio + begin_comment_length);
            }
        }
    }
    while(inicio != -1);
}

void HtmlParser::stripScriptContent()
{
    int inicio = -1;
    TQString const begin_script = "<script";
    TQString const end_script = "</script>";
    uint const begin_script_length = begin_script.length();

    do
    {
        inicio = findWord(document_, begin_script);
        if(inicio != -1)
        {
            int fim = findWord(document_, end_script, inicio);

            if(fim == -1)
            {
                kdDebug(23100) <<  "Malformed script tag!" << endl;
                document_.remove(inicio - begin_script_length, begin_script_length);
            }
            else
            {
                script_ += "\n" + document_.mid(inicio - begin_script_length,
                                                fim - inicio + begin_script_length);

                document_.remove(inicio - begin_script_length,
                                 fim - inicio + begin_script_length);
            }
        }
    }
    while(inicio != -1);
}


#include <iostream>
void HtmlParser::mostra() const
{
    kdDebug(23100) << "\nA:\n\n";
    for(unsigned int i = 0; i != nodes_.size(); ++i)
    {
        if(nodes_[i]->element() == Node::A)
            kdDebug(23100) << nodes_[i]->url() << "\t" << nodes_[i]->linkLabel() << endl;
    }
    kdDebug(23100) << "____________________________________________________________________" << endl;

    kdDebug(23100) << "\nLINK:\n\n";
    for(unsigned int i = 0; i != nodes_.size(); ++i)
    {
        if(nodes_[i]->element() == Node::LINK)
            kdDebug(23100) << nodes_[i]->url() << "\t" << nodes_[i]->linkLabel() << endl;
    }
    kdDebug(23100) << "____________________________________________________________________" << endl;

    kdDebug(23100) << "\nMETA:\n";
    for(unsigned int i = 0; i != nodes_.size(); ++i)
    {
        if(nodes_[i]->element() == Node::META)
        {
#if defined TQ_WS_WIN
            NodeMETA* nm = (NodeMETA*)nodes_[i];
#else

            NodeMETA* nm = dynamic_cast<NodeMETA*>(nodes_[i]);
#endif

            kdDebug(23100) << nm->url() << endl
            << nm->atributoHTTP_ETQUIV() << endl
            << nm->atributoNAME() << endl
            << nm->atributoCONTENT() << endl;
        }
    }
    kdDebug(23100) << "____________________________________________________________________" << endl;

    kdDebug(23100) << "\nIMG:\n\n";
    for(unsigned int i = 0; i != nodes_.size(); ++i)
    {
        if(nodes_[i]->element() == Node::IMG)
            kdDebug(23100) << nodes_[i]->url() << "\t"
            << nodes_[i]->linkLabel() << endl;
    }
    kdDebug(23100) << "____________________________________________________________________" << endl;

    kdDebug(23100) << "\nFRAME:\n\n";
    for(unsigned int i = 0; i != nodes_.size(); ++i)
    {
        if(nodes_[i]->element() == Node::FRAME)
            kdDebug(23100) << nodes_[i]->url() << endl;
    }
    kdDebug(23100) << "____________________________________________________________________" << endl;

    kdDebug(23100) << "\nBASE:\n\n";
    kdDebug(23100) << node_BASE_.url() << endl;

    kdDebug(23100) << "____________________________________________________________________" << endl;

}

#ifdef HTMLPARSER

#include <fstream>

int main()
{
    //ifstream stream("aterraprometida.html");
    //ifstream stream("/var/www/html/STL/standard_library.html");
    //ifstream stream("/var/www/html/qt-doc/functions.html");
    ifstream stream("/var/www/html/index.html");

    TQString content;
    while(stream)
    {
        char c;
        stream.get(c);
        content += c;
    }
    //  kdDebug(23100) << content << endl;
    kdDebug(23100) <<  "__________________________________________________________" << endl;
    HtmlParser parser(content);
    parser.mostra();
    kdDebug(23100) <<  "__________________________________________________________\n\n\n" << endl;
    vector<Node*> nods = parser.nodes();
    for(int i = 0; i != nods.size(); ++i)
    {
        if(nods[i]->element() == Node::META)
        {
            NodeMETA* nod_meta = (NodeMETA*)(nods[i]);
            //Node* nod_meta = nods[i];

            kdDebug(23100) << nod_meta->atributoCONTENT() << endl;
        }

    }
}


#endif