You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdeaddons/konq-plugins/webarchiver/archivedialog.cpp

566 lines
16 KiB

/*
Copyright (C) 2001 Andreas Schlapbach <schlpbch@iam.unibe.ch>
Copyright (C) 2003 Antonio Larrosa <larrosa@kde.org>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; see the file COPYING. If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
Boston, MA 02110-1301, USA.
*/
#include "archivedialog.h"
#include <tqwidget.h>
#include <tdehtml_part.h>
#include "archiveviewbase.h"
#include <kinstance.h>
#include <ktempfile.h>
#include <ktar.h>
#include <tdefiledialog.h>
#include <kmessagebox.h>
#include <kpassivepopup.h>
#include <klocale.h>
#include <tdeio/netaccess.h>
#include <tdehtml_part.h>
#include <kdebug.h>
#include <kgenericfactory.h>
#include <kactivelabel.h>
#include <tqstylesheet.h>
#include <tqiodevice.h>
#include <klistview.h>
#include <tdeio/job.h>
#include <kapplication.h>
#include <kurllabel.h>
#include <kprogress.h>
#include <kstringhandler.h>
#include <tqpushbutton.h>
#undef DEBUG_WAR
#define CONTENT_TYPE "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">"
/*
 * Builds the archiver dialog for the page currently shown in `part`.
 *
 * parent   - parent widget handed to KDialogBase
 * filename - path of the archive to create; shown in the dialog and used
 *            as the target of the tar-ball
 * part     - the KHTMLPart whose document and URL are archived
 *
 * The dialog deletes itself on close (WDestructiveClose). The OK button is
 * relabelled as "Close" and stays disabled until archiving has finished.
 */
ArchiveDialog::ArchiveDialog(TQWidget *parent, const TQString &filename,
                             KHTMLPart *part)
    : KDialogBase(parent, "WebArchiveDialog", false, i18n("Web Archiver"),
                  KDialogBase::Ok | KDialogBase::Cancel | KDialogBase::User1 ),
      m_bPreserveWS(false),
      m_tmpFile(0),
      m_url(part->url())
{
    m_widget = new ArchiveViewBase(this);
    setMainWidget(m_widget);
    setWFlags(getWFlags() | WDestructiveClose);

    // Source URL and target file are shown as (squeezed) hyperlinks.
    const TQString pageUrl = m_url.url();
    const TQString sourceLink = TQString("<a href=\"") + pageUrl + "\">"
                                + KStringHandler::csqueeze( pageUrl, 80 ) + "</a>";
    const TQString targetLink = TQString("<a href=\"") + filename + "\">"
                                + KStringHandler::csqueeze( filename, 80 ) + "</a>";
    m_widget->urlLabel->setText(sourceLink);
    m_widget->targetLabel->setText(targetLink);

    // Prefer the owner document when the part's document has one.
    if (!part->document().ownerDocument().isNull())
        m_document = part->document().ownerDocument();
    else
        m_document = part->document();

    enableButtonOK( false );
    showButton( KDialogBase::User1, false );
    setButtonOK( KStdGuiItem::close() );

    m_tarBall = new KTar(filename, "application/x-gzip");
}
/*
 * Starts an archiving run: resets the download cursor, opens the tar-ball
 * for writing and kicks off saving of the main document. If the tar-ball
 * cannot be opened the user is told so and nothing else happens.
 */
void ArchiveDialog::archive()
{
    m_iterator = 0;
    m_currentLVI = 0;

    if (!m_tarBall->open(IO_WriteOnly)) {
        const TQString title = i18n( "Unable to Open Web-Archive" );
        const TQString text = i18n( "Unable to open \n %1 \n for writing." ).arg(m_tarBall->fileName());
        KMessageBox::sorry( 0L, text, title );
        return;
    }

#ifdef DEBUG_WAR
    kdDebug(90110) << "Web Archive opened " << endl;
#endif
    // Reserve the name of the main page so that getUniqueFileName() never
    // hands it out to a downloaded resource.
    m_linkDict.insert(TQString("index.html"), TQString(""));
    saveFile("index.html");
}
/*
 * Releases the resources owned by the dialog.
 *
 * m_tmpFile is non-null only while a download is in flight (downloadNext()
 * allocates it, finishedDownloadingURL() frees it and resets it to 0), so it
 * has to be released here as well in case the dialog is destroyed before the
 * pending download reports back. Deleting a null pointer is a no-op.
 */
ArchiveDialog::~ArchiveDialog()
{
    delete m_tmpFile;
    m_tmpFile = 0;
    delete m_tarBall;
}
/* Store the HTMLized DOM-Tree to a temporary file and add it to the Tar-Ball */
void ArchiveDialog::saveFile( const TQString&)
{
KTempFile tmpFile;
if (!(tmpFile.status())) {
TQString temp;
m_state=Retrieving;
TQTextStream *tempStream = new TQTextStream(&temp, IO_ReadOnly);
saveToArchive(tempStream);
delete tempStream;
m_downloadedURLDict.clear();
m_state=Downloading;
m_widget->progressBar->setTotalSteps(m_urlsToDownload.count());
m_widget->progressBar->setProgress(0);
downloadNext();
} else {
const TQString title = i18n( "Could Not Open Temporary File" );
const TQString text = i18n( "Could not open a temporary file" );
KMessageBox::sorry( 0, text, title );
}
}
void ArchiveDialog::setSavingState()
{
KTempFile tmpFile;
TQTextStream* textStream = tmpFile.textStream();
textStream->setEncoding(TQTextStream::UnicodeUTF8);
m_widget->progressBar->setProgress(m_widget->progressBar->totalSteps());
m_state=Saving;
saveToArchive(textStream);
tmpFile.close();
TQString fileName="index.html";
TQFile file(tmpFile.name());
file.open(IO_ReadOnly);
m_tarBall->writeFile(fileName, TQString(), TQString(), file.size(), file.readAll());
#ifdef DEBUG_WAR
kdDebug(90110) << "HTML-file written: " << fileName << endl;
#endif
file.close();
// Cleaning up
file.remove();
m_tarBall->close();
KPassivePopup::message( m_url.prettyURL() , i18n( "Archiving webpage completed." ), this );
enableButtonOK(true);
setEscapeButton(Ok);
actionButton(Ok)->setFocus();
enableButtonCancel(false);
}
/*
 * Serializes the whole document to the given stream.
 *
 * Writes a leading "saved from" comment containing the page URL and then
 * traverses the DOM tree recursively, starting at the document element.
 * Any exception escaping the traversal is caught and only logged.
 * Does nothing when no stream is supplied.
 */
void ArchiveDialog::saveToArchive(TQTextStream* _textStream)
{
    if (_textStream == 0)
        return;

    TQTextStream &out = *_textStream;
    out << "<!-- saved from:" << endl << m_url.url() << " -->" << endl;

    try {
        saveArchiveRecursive(m_document.documentElement(), m_url, _textStream, 0);
    } catch (...) {
        kdDebug(90110) << "exception" << endl;
    }
}
/*
 * Tells whether pNode carries an attribute with the given name and value.
 *
 * Both the attribute's name and its value are upper-cased before the
 * comparison, so attrName and attrValue have to be passed in upper case.
 */
static bool hasAttribute(const DOM::Node &pNode, const TQString &attrName, const TQString &attrValue)
{
    const DOM::Element asElement = static_cast<const DOM::Element>(pNode);
    DOM::NamedNodeMap attributeMap = asElement.attributes();
    const unsigned long attributeCount = attributeMap.length();

    for (unsigned int idx = 0; idx < attributeCount; ++idx) {
        DOM::Attr candidate = static_cast<DOM::Attr>(attributeMap.item(idx));
        if (candidate.name().string().upper() != attrName)
            continue;
        if (candidate.value().string().upper() == attrValue)
            return true;
    }
    return false;
}
/*
 * Tells whether pNode has a direct child whose upper-cased node name equals
 * nodeName (which therefore has to be passed in upper case).
 */
static bool hasChildNode(const DOM::Node &pNode, const TQString &nodeName)
{
    DOM::Node current;
    try {
        // Fetching the first child might throw a DOM exception
        current = pNode.firstChild();
    } catch (...) {
        // Treat the node as childless
        current = DOM::Node();
    }

    for (; !current.isNull(); current = current.nextSibling()) {
        if (current.nodeName().string().upper() == nodeName)
            return true;
    }
    return false;
}
/* Transform DOM-Tree to HTML */
void ArchiveDialog::saveArchiveRecursive(const DOM::Node &pNode, const KURL& baseURL,
TQTextStream* _textStream, int indent)
{
const TQString nodeNameOrig(pNode.nodeName().string());
const TQString nodeName(pNode.nodeName().string().upper());
TQString text;
TQString strIndent;
strIndent.fill(' ', indent);
const DOM::Element element = (const DOM::Element) pNode;
DOM::Node child;
if ( !element.isNull() ) {
if (nodeName.at(0)=='-') {
/* Don't save tdehtml internal tags '-konq..'
* Approximating it with <DIV>
*/
text += "<DIV> <!-- -KONTQ_BLOCK -->";
} else if (nodeName == "BASE") {
/* Skip BASE, everything is relative to index.html
* Saving SCRIPT but they can cause trouble!
*/
} else if ((nodeName == "META") && hasAttribute(pNode, "HTTP-EQUIV", "CONTENT-TYPE")) {
/* Skip content-type meta tag, we provide our own.
*/
} else {
if (!m_bPreserveWS) {
if (nodeName == "PRE") {
m_bPreserveWS = true;
}
text = strIndent;
}
text += "<" + nodeNameOrig;
TQString attributes;
TQString attrNameOrig, attrName, attrValue;
DOM::Attr attr;
DOM::NamedNodeMap attrs = element.attributes();
unsigned long lmap = attrs.length();
for( unsigned int j=0; j<lmap; j++ ) {
attr = static_cast<DOM::Attr>(attrs.item(j));
attrNameOrig = attr.name().string();
attrName = attrNameOrig.upper();
attrValue = attr.value().string();
#if 0
if ((nodeName == "FRAME" || nodeName == "IFRAME") && attrName == "SRC") {
//attrValue = handleLink(baseURL, attrValue);
/* Going recursively down creating a DOM-Tree for the Frame, second Level of recursion */
//## Add Termination criteria, on the other hand frames are not indefinetly nested, are they :)
KHTMLPart* part = new KHTMLPart();
KURL absoluteURL = getAbsoluteURL(baseURL, attrValue);
part->openURL(absoluteURL);
saveFile(getUniqueFileName(absoluteURL.fileName()), part);
delete part;
} else if
#endif
if ((nodeName == "LINK" && attrName == "HREF") || // Down load stylesheets, js-script, ..
((nodeName == "FRAME" || nodeName == "IFRAME") && attrName == "SRC") ||
((nodeName == "IMG" || nodeName == "INPUT" || nodeName == "SCRIPT") && attrName == "SRC") ||
((nodeName == "BODY" || nodeName == "TABLE" || nodeName == "TH" || nodeName == "TD") && attrName == "BACKGROUND")) {
// Some people use carriage return in file names and browsers support that!
attrValue = handleLink(baseURL, attrValue.replace(TQRegExp("\\s"), ""));
}
/*
* ## Make recursion level configurable
*/
/*
} else if (nodeName == "A" && attrName == "HREF") {
attrValue = handleLink(baseURL, attrValue);
*/
attributes += " " + attrName + "=\"" + attrValue + "\"";
}
if (!(attributes.isEmpty())){
text += " ";
}
text += attributes.simplifyWhiteSpace();
text += ">";
if (nodeName == "HTML") {
/* Search for a HEAD tag, if not found, generate one.
*/
if (!hasChildNode(pNode, "HEAD"))
text += "\n" + strIndent + " <HEAD>" CONTENT_TYPE "</HEAD>";
}
else if (nodeName == "HEAD") {
text += "\n" + strIndent + " " + CONTENT_TYPE;
}
}
} else {
const TQString& nodeValue(pNode.nodeValue().string());
if (!(nodeValue.isEmpty())) {
// Don't escape < > in JS or CSS
TQString parentNodeName = pNode.parentNode().nodeName().string().upper();
if (parentNodeName == "STYLE") {
text = analyzeInternalCSS(baseURL, pNode.nodeValue().string());
} else if (m_bPreserveWS) {
text = TQStyleSheet::escape(pNode.nodeValue().string());
} else if (parentNodeName == "SCRIPT") {
text = pNode.nodeValue().string();
} else {
text = strIndent + TQStyleSheet::escape(pNode.nodeValue().string());
}
}
}
#ifdef DEBUG_WAR
kdDebug(90110) << "text:" << text << endl;
#endif
if (!(text.isEmpty())) {
(*_textStream) << text;
if (!m_bPreserveWS) {
(*_textStream) << endl;
}
}
try
{
// We might throw a DOM exception
child = pNode.firstChild();
}
catch (...)
{
// No children, stop recursion here
child = DOM::Node();
}
while(!child.isNull()) {
saveArchiveRecursive(child, baseURL, _textStream, indent+2);
child = child.nextSibling();
}
if (!(element.isNull())) {
if (nodeName == "AREA" || nodeName == "BASE" || nodeName == "BASEFONT" ||
nodeName == "BR" || nodeName == "COL" || nodeName == "FRAME" ||
nodeName == "HR" || nodeName == "IMG" || nodeName == "INPUT" ||
nodeName == "ISINDEX" || nodeName == "META" || nodeName == "PARAM") {
/* Closing Tag is forbidden, see HTML 4.01 Specs: Index of Elements */
} else {
if (!m_bPreserveWS) {
text = strIndent;
} else {
text ="";
}
if (nodeName.at(0)=='-') {
text += "</DIV> <!-- -KONTQ_BLOCK -->";
} else {
text += "</" + pNode.nodeName().string() + ">";
if (nodeName == "PRE") {
m_bPreserveWS = false;
}
}
#ifdef DEBUG_WAR
kdDebug(90110) << text << endl;
#endif
if (!(text.isEmpty())) {
(*_textStream) << text;
if (!m_bPreserveWS) {
(*_textStream) << endl;
}
}
}
}
}
/*
 * Resolves _link against _url and processes it according to the current
 * state:
 *  - Retrieving: the absolute URL is queued in m_urlsToDownload,
 *  - Saving:     the entry stored for the URL in m_downloadedURLDict is
 *                returned.
 * In every other case, and when the URL action is not authorized, an empty
 * string is returned.
 */
TQString ArchiveDialog::handleLink(const KURL& _url, const TQString& _link)
{
    KURL target(getAbsoluteURL(_url, _link));
    TQString archivedName;

    if (!kapp->authorizeURLAction("redirect", _url, target))
        return archivedName;

    if (m_state==Saving)
        archivedName = m_downloadedURLDict[target.url()];
    else if (m_state==Retrieving)
        m_urlsToDownload.append(target);

    return archivedName;
}
/*
 * Advances the download queue.
 *
 * URLs that are already present in m_downloadedURLDict are skipped. For the
 * first URL that is not, a copy job into a fresh temporary file is started
 * and a list view entry is added; finishedDownloadingURL() is invoked when
 * that job reports its result. Once the end of the queue is reached,
 * setSavingState() writes the archive.
 */
void ArchiveDialog::downloadNext()
{
    for (;;) {
        if (m_iterator>=m_urlsToDownload.count())
        {
            // We've already downloaded all the files we wanted, let's save them
            setSavingState();
            return;
        }

        KURL url=m_urlsToDownload[m_iterator];
#ifdef DEBUG_WAR
        kdDebug(90110) << "URL : " << url.url() << endl;
#endif
        TQString tarFileName;

        if (!m_downloadedURLDict.contains(url.url())) {
            // Obtain a unique temporary file name; the file itself is
            // removed again so that the copy job can create it.
            delete m_tmpFile;
            m_tmpFile=new KTempFile();
            m_tmpFile->close();
            TQFile::remove(m_tmpFile->name());
            kdDebug(90110) << "downloading: " << url.url() << " to: " << m_tmpFile->name() << endl;

            KURL dsturl;
            dsturl.setPath(m_tmpFile->name());
            TDEIO::Job *job=TDEIO::file_copy(url, dsturl, -1, false, false, false);
            job->addMetaData("cache", "cache"); // Use entry from cache if available.
            connect(job, TQT_SIGNAL(result( TDEIO::Job *)), this, TQT_SLOT(finishedDownloadingURL( TDEIO::Job *)) );

            m_currentLVI=new TQListViewItem(m_widget->listView, url.prettyURL());
            m_widget->listView->insertItem( m_currentLVI );
            m_currentLVI->setText(1,i18n("Downloading"));
#ifdef DEBUG_WAR
            kdDebug(90110) << "TarFileName: [" << tarFileName << "]" << endl << endl;
#endif
            return;
        }

        // Only download file once
        tarFileName = m_downloadedURLDict[url.url()];
#ifdef DEBUG_WAR
        kdDebug(90110) << "File already downloaded: " << url.url()
        << m_downloadedURLDict.count() << endl;
#endif
        m_iterator++;
    }
}
/*
 * Slot invoked with the result of the copy job started by downloadNext().
 *
 * Marks the current list view entry as "Ok" or "Error", stores the
 * downloaded data under a unique name in the tar-ball, records the mapping
 * URL -> archive name and continues with the next URL.
 *
 * The entry is written even when the job failed or the temporary file
 * cannot be read; it is empty in that case.
 */
void ArchiveDialog::finishedDownloadingURL( TDEIO::Job *job )
{
    if ( job->error() )
    {
        // TQString s=job->errorString();
        m_currentLVI->setText(1,i18n("Error"));
    }
    else
        m_currentLVI->setText(1,i18n("Ok"));
    m_widget->progressBar->advance(1);

    KURL url=m_urlsToDownload[m_iterator];
    TQString tarFileName = getUniqueFileName(url.fileName());

    // Read the temporary file once and hand its real size to the tar-ball,
    // so that size and data can never disagree (e.g. when the file could
    // not be opened).
    TQByteArray contents;
    TQFile file(m_tmpFile->name());
    if (file.open(IO_ReadOnly)) {
        contents = file.readAll();
        file.close();
    }

    // Add file to Tar-Ball
    m_tarBall->writeFile(tarFileName, TQString(), TQString(), contents.size(), contents.data());

    m_tmpFile->unlink();
    delete m_tmpFile;
    m_tmpFile=0;

    // Add URL to downloaded URLs
    m_downloadedURLDict.insert(url.url(), tarFileName);
    m_linkDict.insert(tarFileName, TQString(""));

    m_iterator++;
    downloadNext();
}
/*
 * Creates the absolute URL for downloading _link, which may be given
 * relative to _url.
 */
KURL ArchiveDialog::getAbsoluteURL(const KURL& _url, const TQString& _link)
{
    // The two-argument KURL constructor does all the work
    const KURL absoluteUrl(_url, _link);
    return absoluteUrl;
}
/*
 * Returns a name based on fileName that is non-empty and not yet contained
 * in m_linkDict. On a clash (or for an empty fileName) a running number is
 * put in front of fileName until the result is free. The counter is shared
 * by all calls.
 */
TQString ArchiveDialog::getUniqueFileName(const TQString& fileName)
{
    static int id=2;
#ifdef DEBUG_WAR
    kdDebug(90110) << "getUniqueFileName(..): [" << fileName << "]" << endl;
#endif
    TQString candidate(fileName);
    for (;;) {
        const bool usable = !candidate.isEmpty() && !m_linkDict.contains(candidate);
        if (usable)
            break;
        // Name clash -> add unique id
        candidate = TQString::number(id++) + fileName;
    }
    return candidate;
}
/*
 * Searches an internal style sheet for url(...) references, passes each
 * referenced URL through handleLink() and puts the returned string in its
 * place.
 *
 * _url   - base URL the references are resolved against
 * string - text of the style sheet
 *
 * Returns the adjusted style sheet text. Optional single or double quotes
 * around the URL are kept. Scanning stops at a url( that has no closing
 * parenthesis; the text from there on is left unchanged.
 */
TQString ArchiveDialog::analyzeInternalCSS(const KURL& _url, const TQString& string)
{
#ifdef DEBUG_WAR
    kdDebug () << "analyzeInternalCSS" << endl;
#endif
    TQString str(string);
    int pos = 0;

    // The text changes its length with every replacement, hence the bound
    // has to be taken from the current text.
    while (pos >= 0 && pos < static_cast<int>(str.length())) {
        pos = str.find("url(", pos);
        if (pos == -1)
            break;

        pos += 4; // url(
        if (str[pos]=='"' || str[pos]=='\'') // CSS 'feature'
            pos++;
        const int startUrl = pos;

        const int closeParen = str.find(")", startUrl);
        if (closeParen == -1)
            break; // unterminated url( - nothing sensible to replace

        int endUrl = closeParen;
        // Only strip a closing quote that belongs to the URL itself, so
        // that the span never gets a negative length (e.g. for url(") ).
        if (endUrl > startUrl && (str[endUrl-1]=='"' || str[endUrl-1]=='\'')) // CSS 'feature'
            endUrl--;

        TQString url = str.mid(startUrl, endUrl-startUrl);
#ifdef DEBUG_WAR
        kdDebug () << "url: " << url << endl;
#endif
        url = handleLink(_url, url);
#ifdef DEBUG_WAR
        kdDebug () << "url: " << url << endl;
#endif
        str.replace(startUrl, endUrl-startUrl, url);

        // Continue right behind the replacement; positions taken before the
        // replacement are no longer valid.
        pos = startUrl + static_cast<int>(url.length());
    }
    return str;
}
#include "archivedialog.moc"