|
|
|
/*
|
|
|
|
This file is part of Akregator.
|
|
|
|
|
|
|
|
Copyright (C) 2004 Teemu Rytilahti <tpr@d5k.net>
|
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
|
|
|
|
As a special exception, permission is given to link this program
|
|
|
|
with any edition of TQt, and distribute the resulting executable,
|
|
|
|
without including the source code for TQt in the source distribution.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <tqregexp.h>
|
|
|
|
#include <tqstring.h>
|
|
|
|
#include <tqstringlist.h>
|
|
|
|
#include <tqvaluelist.h>
|
|
|
|
#include <kcharsets.h>
|
|
|
|
|
|
|
|
#include "feeddetector.h"
|
|
|
|
|
|
|
|
|
|
|
|
using namespace Akregator;
|
|
|
|
|
|
|
|
// Scans the given HTML text for <link> tags whose rel attribute is
// "alternate" or "service.feed" and builds one FeedDetectorEntry (url, title)
// for every such tag that
//   - carries one of the accepted feed type attributes, and
//   - has a non-empty href attribute.
// Only double-quoted attribute values are recognised by the patterns below.
// If a tag has no title, the URL is used as a preliminary title.
FeedDetectorEntryList FeedDetector::extractFromLinkTags(const TQString& s)
{
    // collapse every run of spaces, newlines etc. into a single space so the
    // patterns only ever have to cope with one whitespace character
    const TQString html = s.simplifyWhiteSpace();

    // a complete <link ...> tag with a matching rel attribute
    // (second constructor argument 'false': match case-insensitively)
    TQRegExp reLinkTag("<[\\s]?LINK[^>]*REL[\\s]?=[\\s]?\\\"[\\s]?(ALTERNATE|SERVICE\\.FEED)[\\s]?\\\"[^>]*>", false);

    // attribute extractors; capture group 1 is the quoted attribute value
    TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);   // href="url"
    TQRegExp reType("TYPE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);   // type="mime"
    TQRegExp reTitle("TITLE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false); // title="title"

    // first pass: collect the text of every matching <link> tag
    TQStringList candidates;
    int offset = 0;
    for (;;)
    {
        const int found = reLinkTag.search(html, offset);
        if (found == -1)
            break;

        const int length = reLinkTag.matchedLength();
        candidates.append( html.mid(found, length) );
        // resume the search right behind the tag just consumed
        offset = found + length;
    }

    // second pass: pull type, title and url out of each collected tag
    FeedDetectorEntryList feeds;

    for ( TQStringList::Iterator tag = candidates.begin(); tag != candidates.end(); ++tag )
    {
        const TQString& linkTag = *tag;

        TQString mimeType;
        if (reType.search(linkTag, 0) != -1)
            mimeType = reType.cap(1).lower();

        // we accept only type attributes indicating a feed
        const bool indicatesFeed = mimeType == "application/rss+xml"
                                || mimeType == "application/rdf+xml"
                                || mimeType == "application/atom+xml"
                                || mimeType == "text/xml";
        if ( !indicatesFeed )
            continue;

        TQString title;
        if (reTitle.search(linkTag, 0) != -1)
            title = reTitle.cap(1);
        title = KCharsets::resolveEntities(title);

        TQString url;
        if (reHref.search(linkTag, 0) != -1)
            url = reHref.cap(1);
        url = KCharsets::resolveEntities(url);

        // a tag without a usable href yields no entry
        if ( url.isEmpty() )
            continue;

        // if feed has no title, use the url as preliminary title (until feed is parsed)
        if ( title.isEmpty() )
            title = url;

        feeds.append( FeedDetectorEntry(url, title) );
    }

    return feeds;
}
|
|
|
|
|
|
|
|
// Scans the given HTML text for <a ... href="url"> tags and returns, in
// document order, every href value (after KCharsets::resolveEntities()) that
// ends in "rss", "rdf" or "xml" (compared case-insensitively).
// Only double-quoted href values are recognised. Duplicates are not removed.
TQStringList FeedDetector::extractBruteForce(const TQString& s)
{
    // collapse every run of whitespace into a single space so the patterns
    // only ever have to cope with one whitespace character
    const TQString str = s.simplifyWhiteSpace();

    // A complete <a ...> tag containing a double-quoted href attribute.
    // The tag name must be followed by whitespace (so <abbr>, <area> etc. are
    // not mistaken for anchors); any number of other attributes may precede
    // HREF, and whitespace around '=' is tolerated, consistent with reHref.
    // (The previous pattern "A[^>]?HREF=" allowed at most one character
    // between the tag name and HREF, so anchors such as
    // <a class="x" href="feed.rss"> were silently skipped.)
    TQRegExp reAhrefTag("<[\\s]?A[\\s][^>]*HREF[\\s]?=[\\s]?\\\"[^\\\"]*\\\"[^>]*>", false);

    // extracts the URL (href="url"); capture group 1 is the quoted value
    TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);

    // used with exactMatch(): the whole URL must end in RSS, RDF or XML
    TQRegExp rssrdfxml(".*(RSS|RDF|XML)", false);

    // get all <a href> tags and capture url
    TQStringList list;
    int pos = 0;
    for (;;)
    {
        const int matchpos = reAhrefTag.search(str, pos);
        if ( matchpos == -1 )
            break;

        const int length = reAhrefTag.matchedLength();
        const TQString ahref = str.mid(matchpos, length);

        if ( reHref.search(ahref, 0) != -1 )
        {
            const TQString url = KCharsets::resolveEntities(reHref.cap(1));

            if ( rssrdfxml.exactMatch(url) )
                list.append(url);
        }

        // resume the search right behind the tag just consumed
        pos = matchpos + length;
    }

    return list;
}
|