You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
342 lines
8.8 KiB
342 lines
8.8 KiB
//
|
|
// HtFile.cc
|
|
//
|
|
// HtFile: Interface classes for retrieving local documents
|
|
//
|
|
// Including:
|
|
// - Generic class
|
|
//
|
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
|
// Copyright (c) 1995-2004 The ht://Dig Group
|
|
// For copyright details, see the file COPYING in your distribution
|
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
|
//
|
|
// $Id: HtFile.cc,v 1.13 2004/05/28 13:15:23 lha Exp $
|
|
//
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "htconfig.h"
|
|
#endif /* HAVE_CONFIG_H */
|
|
|
|
#include "lib.h"
|
|
#include "Transport.h"
|
|
#include "HtFile.h"
|
|
#include "Dictionary.h"
|
|
#include "StringList.h"
|
|
#include "defaults.h" // for config
|
|
|
|
#include <signal.h>
|
|
#include <sys/types.h>
|
|
#include <ctype.h>
|
|
|
|
#ifdef HAVE_STD
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#ifdef HAVE_NAMESPACES
|
|
using namespace std;
|
|
#endif
|
|
#else
|
|
#include <iostream.h>
|
|
#include <fstream.h>
|
|
#endif /* HAVE_STD */
|
|
|
|
#include <stdio.h> // for sscanf
|
|
#include <sys/stat.h>
|
|
|
|
#ifndef _MSC_VER /* _WIN32 */
|
|
#include <unistd.h>
|
|
#endif
|
|
|
|
#ifdef _MSC_VER /* _WIN32 */
|
|
#include "dirent_local.h"
|
|
#else
|
|
#include <dirent.h> // for scandir
|
|
#endif
|
|
|
|
#ifdef _MSC_VER /* _WIN32 */
|
|
#define popen _popen
|
|
#define pclose _pclose
|
|
#define lstat stat
|
|
#define readlink(x,y,z) {-1}
|
|
#endif
|
|
|
|
|
|
///////
|
|
// HtFile_Response class
|
|
///////
|
|
|
|
|
|
// Construction
|
|
|
|
HtFile_Response::HtFile_Response()
|
|
{
|
|
}
|
|
|
|
|
|
// Destruction
|
|
|
|
HtFile_Response::~HtFile_Response()
|
|
{
|
|
}
|
|
|
|
///////
|
|
// HtFile generic class
|
|
//
|
|
//
|
|
///////
|
|
|
|
|
|
// Construction
|
|
|
|
HtFile::HtFile()
|
|
{
|
|
}
|
|
|
|
// Destruction
|
|
|
|
HtFile::~HtFile()
|
|
{
|
|
// It's empty
|
|
}
|
|
|
|
|
|
// Return mime type indicated by extension ext (which is assumed not
|
|
// to contain the '.'), or NULL if ext is not a know mime type.
|
|
const String *HtFile::Ext2Mime (const char *ext)
|
|
{
|
|
static Dictionary *mime_map = 0;
|
|
|
|
if (!mime_map)
|
|
{
|
|
HtConfiguration* config= HtConfiguration::config();
|
|
mime_map = new Dictionary();
|
|
if (!mime_map)
|
|
return NULL;
|
|
|
|
if (debug > 2)
|
|
cout << "MIME types: " << config->Find("mime_types").get() << endl;
|
|
ifstream in(config->Find("mime_types").get());
|
|
if (in)
|
|
{
|
|
String line;
|
|
while (in >> line)
|
|
{
|
|
line.chop("\n\r \t");
|
|
int cmt;
|
|
if ((cmt = line.indexOf('#')) >= 0)
|
|
line = line.sub(0, cmt);
|
|
StringList split_line(line, "\t ");
|
|
// Let's cache mime type to lesser the number of
|
|
// operator [] callings
|
|
String mime_type = split_line[0];
|
|
// Fill map with values.
|
|
for (int i = 1; i < split_line.Count(); i++)
|
|
{
|
|
if (debug > 3)
|
|
cout << "MIME: " << split_line[i]
|
|
<< "\t-> " << mime_type << endl;
|
|
mime_map->Add(split_line[i], new String(mime_type));
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (debug > 2)
|
|
cout << "MIME types file not found. Using default types.\n";
|
|
mime_map->Add(String("html"), new String("text/html"));
|
|
mime_map->Add(String("htm"), new String("text/html"));
|
|
mime_map->Add(String("txt"), new String("text/plain"));
|
|
mime_map->Add(String("asc"), new String("text/plain"));
|
|
mime_map->Add(String("pdf"), new String("application/pdf"));
|
|
mime_map->Add(String("ps"), new String("application/postscript"));
|
|
mime_map->Add(String("eps"), new String("application/postscript"));
|
|
}
|
|
}
|
|
|
|
// return MIME type, or NULL if not found
|
|
return (String *)mime_map->Find(ext);
|
|
}
|
|
|
|
// Return mime type of the file named 'fname'.
|
|
// If the type can't be determined, "application/x-unknown" is returned.
|
|
String HtFile::File2Mime (const char *fname)
|
|
{
|
|
HtConfiguration* config= HtConfiguration::config();
|
|
|
|
// default to "can't identify"
|
|
char content_type [100] = "application/x-unknown\n";
|
|
|
|
String cmd = config->Find ("content_classifier");
|
|
if (cmd.get() && *cmd)
|
|
{
|
|
cmd << " \"" << fname << '\"'; // allow file names to have spaces
|
|
FILE *fileptr;
|
|
if ( (fileptr = popen (cmd.get(), "r")) != NULL )
|
|
{
|
|
fgets (content_type, sizeof (content_type), fileptr);
|
|
pclose (fileptr);
|
|
}
|
|
}
|
|
|
|
// Remove trailing newline, charset or language information
|
|
int delim = strcspn (content_type, ",; \n\t");
|
|
content_type [delim] = '\0';
|
|
|
|
if (debug > 1)
|
|
cout << "Mime type: " << fname << ' ' << content_type << endl;
|
|
return (String (content_type));
|
|
}
|
|
|
|
///////
|
|
// Manages the requesting process
|
|
///////
|
|
|
|
HtFile::DocStatus HtFile::Request()
|
|
{
|
|
// Reset the response
|
|
_response.Reset();
|
|
|
|
struct stat stat_buf;
|
|
|
|
String path (_url.path());
|
|
decodeURL (path); // Convert '%20' to ' ' etc
|
|
|
|
// Check that it exists, and is a regular file or directory
|
|
// Don't allow symbolic links to directories; they mess up '../'.
|
|
// Should we allow FIFO's?
|
|
if ( stat(path.get(), &stat_buf) != 0 ||
|
|
!(S_ISREG(stat_buf.st_mode) || S_ISDIR(stat_buf.st_mode)) )
|
|
{
|
|
return Transport::Document_not_found;
|
|
}
|
|
|
|
// Now handle directories with a pseudo-HTML document (and appropriate noindex)
|
|
if ( S_ISDIR(stat_buf.st_mode) )
|
|
{
|
|
_response._content_type = "text/html";
|
|
_response._contents = "<html><head><meta name=\"robots\" content=\"noindex\">\n";
|
|
|
|
struct dirent *namelist;
|
|
DIR *dirList;
|
|
String filename;
|
|
String encodedName;
|
|
|
|
if (( dirList = opendir(path.get()) ))
|
|
{
|
|
while (( namelist = readdir(dirList) ))
|
|
{
|
|
filename = path;
|
|
filename << namelist->d_name;
|
|
|
|
if ( namelist->d_name[0] != '.'
|
|
&& lstat(filename.get(), &stat_buf) == 0 )
|
|
{
|
|
// Recursively resolve symbolic links.
|
|
// Could leave "absolute" links, or even all not
|
|
// containing '../'. That would allow "aliasing" of
|
|
// directories without causing loops.
|
|
|
|
int i; // avoid infinite loops
|
|
for (i=0; (stat_buf.st_mode & S_IFMT) == S_IFLNK && i<10; i++)
|
|
{
|
|
char link [100];
|
|
int count = readlink(filename.get(), link, sizeof(link)-1);
|
|
|
|
if (count < 0)
|
|
break;
|
|
link [count] = '\0';
|
|
encodedName = link;
|
|
encodeURL (encodedName);
|
|
URL newURL (encodedName, _url); // resolve relative paths
|
|
filename = newURL.path();
|
|
decodeURL (filename);
|
|
if (debug > 2)
|
|
cout << "Link to " << link << " gives "
|
|
<< filename.get() << endl;
|
|
lstat(filename.get(), &stat_buf);
|
|
}
|
|
// filename now only sym-link if nested too deeply or I/O err.
|
|
|
|
encodeURL (filename, UNRESERVED "/"); // convert ' ' to '%20' etc., but leave "/" intact
|
|
if (S_ISDIR(stat_buf.st_mode))
|
|
_response._contents << "<link href=\"file://"
|
|
<< filename.get() << "/\">\n";
|
|
else if (S_ISREG(stat_buf.st_mode))
|
|
_response._contents << "<link href=\"file://"
|
|
<< filename.get() << "\">\n";
|
|
}
|
|
}
|
|
closedir(dirList);
|
|
}
|
|
|
|
_response._contents << "</head><body></body></html>\n";
|
|
|
|
if (debug > 4)
|
|
cout << " Directory listing: " << endl << _response._contents << endl;
|
|
|
|
_response._content_length = stat_buf.st_size;
|
|
_response._document_length = _response._contents.length();
|
|
_response._modification_time = new HtDateTime(stat_buf.st_mtime);
|
|
_response._status_code = 0;
|
|
return Transport::Document_ok;
|
|
}
|
|
|
|
if (_modification_time && *_modification_time >= HtDateTime(stat_buf.st_mtime))
|
|
return Transport::Document_not_changed;
|
|
|
|
bool unknown_ext = false;
|
|
char *ext = strrchr(path.get(), '.');
|
|
if (ext == NULL)
|
|
unknown_ext = true;
|
|
else
|
|
{
|
|
const String *mime_type = Ext2Mime(ext + 1);
|
|
if (mime_type)
|
|
_response._content_type = *mime_type;
|
|
else
|
|
unknown_ext = true;
|
|
}
|
|
if (unknown_ext)
|
|
{
|
|
_response._content_type = File2Mime (path.get());
|
|
if (!strncmp (_response._content_type.get(), "application/x-", 14))
|
|
return Transport::Document_not_local;
|
|
}
|
|
|
|
_response._modification_time = new HtDateTime(stat_buf.st_mtime);
|
|
|
|
FILE *f = fopen((const char *)path.get(), "r");
|
|
if (f == NULL)
|
|
return Document_not_found;
|
|
|
|
char docBuffer[8192];
|
|
int bytesRead;
|
|
while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), f)) > 0)
|
|
{
|
|
if (_response._contents.length() + bytesRead > _max_document_size)
|
|
bytesRead = _max_document_size - _response._contents.length();
|
|
_response._contents.append(docBuffer, bytesRead);
|
|
if (_response._contents.length() >= _max_document_size)
|
|
break;
|
|
}
|
|
fclose(f);
|
|
|
|
_response._content_length = stat_buf.st_size;
|
|
_response._document_length = _response._contents.length();
|
|
_response._status_code = 0;
|
|
|
|
if (debug > 2)
|
|
cout << "Read a total of " << _response._document_length << " bytes\n";
|
|
return Transport::Document_ok;
|
|
}
|
|
|
|
// Translate the status code stored in the response into a document
// status: -1 yields Document_not_found, any other value Document_ok.
HtFile::DocStatus HtFile::GetDocumentStatus()
{
    const bool not_found = (_response._status_code == -1);

    return not_found ? Transport::Document_not_found
                     : Transport::Document_ok;
}
|
|
|