diff src/gt_web_cache.c @ 0:d39e1d0d75b6

initial add
author paulo@hit-nxdomain.opendns.com
date Sat, 20 Feb 2010 21:18:28 -0800
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/gt_web_cache.c	Sat Feb 20 21:18:28 2010 -0800
     1.3 @@ -0,0 +1,925 @@
     1.4 +/*
     1.5 + * $Id: gt_web_cache.c,v 1.65 2006/08/06 16:53:36 hexwab Exp $
     1.6 + *
     1.7 + * Copyright (C) 2001-2003 giFT project (gift.sourceforge.net)
     1.8 + *
     1.9 + * This program is free software; you can redistribute it and/or modify it
    1.10 + * under the terms of the GNU General Public License as published by the
    1.11 + * Free Software Foundation; either version 2, or (at your option) any
    1.12 + * later version.
    1.13 + *
    1.14 + * This program is distributed in the hope that it will be useful, but
    1.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
    1.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.17 + * General Public License for more details.
    1.18 + */
    1.19 +
    1.20 +#include "gt_gnutella.h"
    1.21 +
    1.22 +#include "file_cache.h"
    1.23 +#include "http_request.h"
    1.24 +
    1.25 +#include "gt_connect.h"
    1.26 +#include "gt_node.h"
    1.27 +#include "gt_node_list.h"
    1.28 +#include "gt_netorg.h"
    1.29 +
    1.30 +#include "gt_web_cache.h"
    1.31 +#include "gt_conf.h"
    1.32 +
    1.33 +#include "dns.h"
    1.34 +
    1.35 +/*****************************************************************************/
    1.36 +
/* minimum time to wait before reconnecting to a webcache */
#define CACHE_RETRY_TIME             (8 * EHOURS)

/*****************************************************************************/

/* number of times we have hit some gwebcaches */
static int              cache_hits;

/* the absolute next time we will allow ourselves to access a cache */
static time_t           next_atime;

/* amount of time to layoff the caches once we've received some data;
 * doubled each time a cache access yields enough hosts (exponential
 * backoff, see parse_hostfile_response) */
static time_t           backoff_time    = 1 * EHOURS;

/* holds all the caches */
static FileCache       *web_caches;

/* proxy server to contact (STRDUP'd copy of the configured value) */
static char            *gt_proxy_server;

/* webcaches that succeeded connecting, but returned errors or an
 * unparseable response */
static FileCache       *bad_caches;

/* whether we are in the process of checking the caches */
static BOOL             checking_caches;

/*****************************************************************************/

static void parse_hostfile_response (HttpRequest *http_req, char *hosts_file);
static void parse_urlfile_response  (HttpRequest *http_req, char *url_file);
    1.68 +
    1.69 +/*****************************************************************************/
    1.70 +
    1.71 +/* parse the extended data in the webcaches file, now its just mtime */
    1.72 +static BOOL parse_web_cache_value (char *value, time_t *r_atime)
    1.73 +{
    1.74 +	time_t atime;
    1.75 +
    1.76 +	if ((atime = ATOUL (value)) == (unsigned long) -1)
    1.77 +		atime = 0;
    1.78 +
    1.79 +	if (r_atime)
    1.80 +		*r_atime = atime;
    1.81 +
    1.82 +	return TRUE;
    1.83 +}
    1.84 +
    1.85 +/*****************************************************************************/
    1.86 +
    1.87 +static char *new_webcache_url (const char *host, const char *path)
    1.88 +{
    1.89 +	return stringf_dup ("http://%s/%s", host, STRING_NOTNULL(path));
    1.90 +}
    1.91 +
    1.92 +static void ban_webcache (HttpRequest *req, const char *why)
    1.93 +{
    1.94 +	char *url;
    1.95 +
    1.96 +	url = new_webcache_url (req->host, req->path);
    1.97 +	GT->dbg (GT, "banning webcache %s", url);
    1.98 +
    1.99 +	file_cache_insert (bad_caches, url, why);
   1.100 +	file_cache_sync (bad_caches);
   1.101 +
   1.102 +	free (url);
   1.103 +}
   1.104 +
   1.105 +static void insert_webcache (const char *host_name, const char *remote_path,
   1.106 +                             time_t atime)
   1.107 +{
   1.108 +	char *url;
   1.109 +	char *field;
   1.110 +
   1.111 +	url   = new_webcache_url (host_name, remote_path);
   1.112 +	field = stringf_dup ("%lu", atime);
   1.113 +
   1.114 +	file_cache_insert (web_caches, url, field);
   1.115 +
   1.116 +	free (url);
   1.117 +	free (field);
   1.118 +}
   1.119 +
   1.120 +/*****************************************************************************/
   1.121 +
   1.122 +static void handle_close_request (HttpRequest *req, int error_code)
   1.123 +{
   1.124 +	String *s;
   1.125 +
   1.126 +	if (error_code < 0 || error_code < 200 || error_code >= 300)
   1.127 +	{
   1.128 +		if (error_code == -1)
   1.129 +		{
   1.130 +			/* the error was our fault, out of mem, etc. dont do anything */
   1.131 +			GT->DBGFN (GT, "connect to server %s failed for some reason",
   1.132 +			           req->host);
   1.133 +		}
   1.134 +		else
   1.135 +		{
   1.136 +			char err[32];
   1.137 +
   1.138 +			snprintf (err, sizeof(err), "Received error %d", error_code);
   1.139 +
   1.140 +			/*
   1.141 +			 * Not found, internal server error, or too many redirects: ban
   1.142 +			 * the server's URL
   1.143 +			 */
   1.144 +			GT->DBGFN (GT, "server %s returned error %i", req->host,
   1.145 +			           error_code);
   1.146 +			ban_webcache (req, err);
   1.147 +		}
   1.148 +	}
   1.149 +
   1.150 +	/* TODO: this assumes this is the one hostfile request flying around,
   1.151 +	 * and not a urlfile request, which probably needs to be handled
   1.152 +	 * separately */
   1.153 +	checking_caches = FALSE;
   1.154 +
   1.155 +	if ((s = req->data))
   1.156 +		string_free (s);
   1.157 +}
   1.158 +
/*
 * Parse a "hostfile=1" webcache response: one "ip:port" entry per line.
 * Registers up to 50 hosts as ultrapeers, attempts connections to the
 * first few, and backs off from the caches when the response was
 * productive.
 */
static void parse_hostfile_response (HttpRequest *http_req, char *host_file)
{
	int      hosts = 0;
	GtNode  *node;
	time_t   now;

	if (!host_file)
	{
		GT->DBGFN (GT, "empty host file from %s", http_req->host);
		return;
	}

	GT->DBGFN (GT, "hostfile from server = %s", host_file);

	now = time (NULL);

	/*
	 * If the response starts with "ERROR" (or a pseudo-html '<' char), ban
	 * the webcache.
	 */
	if (!strncasecmp (host_file, "ERROR", sizeof ("ERROR") - 1) ||
	    host_file[0] == '<')
	{
		ban_webcache (http_req, "Malformed response content");
		return;
	}

	while (host_file && *host_file)
	{
		char           *host;
		in_addr_t       ip;
		in_port_t       port;

		/* consume one line; string_sep_set advances host_file past it */
		host = string_sep_set (&host_file, "\r\n");

		ip   = net_ip (string_sep (&host, ":"));
		port = ATOI   (host);

		/* skip malformed or unroutable entries */
		if (!port || !ip || ip == INADDR_NONE)
			continue;

		GT->DBGFN (GT, "registering %s:%hu (from cache %s)", net_ip_str (ip),
		           port, http_req->host);

		/* register the hosts as ultrapeers */
		node = gt_node_register (ip, port, GT_NODE_ULTRA);
		hosts++;

		if (!node)
			continue;

		/* set the vitality on this node to preserve it across restarts */
		node->vitality = now;

		/* might be connected already */
		if (node->state != GT_NODE_DISCONNECTED)
			continue;

		/* try to connect to the first 5 */
		if (hosts <= 5 && gt_conn_need_connections (GT_NODE_ULTRA))
			gt_connect (node);

		/* don't allow the cache to register an infinite number of hosts */
		if (hosts >= 50)
			break;
	}

	/* save the nodes we added to disk so we dont hit the caches again */
	gt_node_list_save ();

	/*
	 * Do an exponential backoff from the caches. If we were online and
	 * able to receive data, we should be getting node information
	 * some other way now.
	 */
	if (hosts >= 5)
	{
		next_atime    = now + backoff_time;
		backoff_time *= 2;
	}
}
   1.240 +
/*
 * Parse a "urlfile=1" webcache response: one cache URL per line.  At most
 * two previously-unknown caches are accepted per response, to keep a
 * single cache from polluting our list.
 */
static void parse_urlfile_response (HttpRequest *http_req, char *url_file)
{
	int caches = 0;

	if (!url_file)
	{
		GT->DBGFN (GT, "empty url file from %s", http_req->host);
		return;
	}

	GT->DBGFN (GT, "urlfile from server = %s", url_file);

	while (url_file && *url_file)
	{
		char *url;
		char *host_name;
		char *remote_path;

		url = string_sep_set (&url_file, "\r\n");

		/* skip past http:// */
		string_sep (&url, "http://");

		host_name   = string_sep (&url, "/");
		remote_path = url;

		/* NOTE: remote_path is possibly empty */
		if (!host_name)
			continue;

		/* rebuild a canonical URL before the duplicate check */
		url = stringf ("http://%s/%s", host_name, STRING_NOTNULL(remote_path));

		/* if the webcache is already in our db, skip it */
		if (file_cache_lookup (web_caches, url))
			continue;

		/*
		 * Only allow caches to register two more caches: this
		 * small number helps to avoid our list of caches getting
		 * polluted.
		 */
		if (++caches > 2)
			break;

		/* format is: <url> <last time visited> */
		file_cache_insert (web_caches, url, "0");
	}

	/* sync the pending web caches to disk */
	file_cache_sync (web_caches);
}
   1.292 +
   1.293 +static void end_request (HttpRequest *req, char *data)
   1.294 +{
   1.295 +	char *str = req->request;
   1.296 +
   1.297 +	if (str && !strncmp (str, "hostfile", strlen ("hostfile")))
   1.298 +		parse_hostfile_response (req, data);
   1.299 +	else if (str && !strncmp (str, "urlfile", strlen ("urlfile")))
   1.300 +		parse_urlfile_response (req, data);
   1.301 +	else
   1.302 +		abort ();
   1.303 +}
   1.304 +
   1.305 +/*****************************************************************************/
   1.306 +
/*
 * Return TRUE if newname is in the same domain as oldname.  For example,
 * "new.gwc.example.com", "example.com", and "cache.example.com" are all
 * considered in the same domain as "www.example.com".
 *
 * This is called on redirects, to make sure the cache can't redirect to an
 * innocent site as part of a DDoS attack.
 *
 * NOTE(review): the real implementation below is compiled out (#if 0), so
 * this currently returns FALSE unconditionally -- every redirect is
 * treated as cross-domain and refused.
 */
static BOOL in_same_domain (const char *oldname, const char *newname)
{
	return FALSE;
#if 0
	const char *p;
	const char *largest = NULL;
	int         periods = 0;

	p = newname;

	/* get the largest common substring */
	while (p != NULL)
	{
		if ((largest = strstr (oldname, p)))
			break;

		/* advance to next domain part */
		p = strchr (p + 1, '.');
	}

	if (!largest)
		return FALSE;

	/*
	 * Make sure the substring matches completely to the end.  This will
	 * actually fail when it shouldn't if one name includes the '.' toplevel
	 * domain and one doesn't.  Oh well.
	 */
	if (strcmp (largest, p) != 0)
		return FALSE;

	/*
	 * Count the number of periods to find the number of subdomains in the
	 * largest common substring.
	 */
	for (p = largest; *p != 0; p++)
	{
		if (*p == '.')
			periods++;
	}

	/*
	 * If the last character is the root '.', subtract one, since we are
	 * looking for the number of common subdomains, and the root is shared by
	 * all names.
	 */
	if (largest[strlen (largest) - 1] == '.')
		periods--;

	/*
	 * If there are two periods, at least two toplevel domains match.
	 */
	if (periods >= 2)
		return TRUE;

	/*
	 * If there is only one period shared, the names MAY be in the same
	 * domain: one of the names has to be completely contained within the
	 * other, such as the case of "foo.example.com" and "example.com".
	 */
	if (periods == 1 &&
	    (strcmp (largest, oldname) == 0 || strcmp (largest, newname) == 0))
	{
		return TRUE;
	}

	/* not in same domain */
	return FALSE;
#endif
}
   1.385 +
   1.386 +/*
   1.387 + * Called to when the webcache sends a 300-level response with a provided
   1.388 + * Location: header.  Have to make sure the domain the cache directs us
   1.389 + * to is the same.
   1.390 + */
   1.391 +static BOOL handle_redirect (HttpRequest *req, const char *new_host,
   1.392 +                             const char *new_path)
   1.393 +{
   1.394 +	assert (new_host != NULL);
   1.395 +
   1.396 +	if (in_same_domain (req->host, new_host) == FALSE)
   1.397 +		return FALSE;
   1.398 +
   1.399 +	/* might want to do something else if the ban list later becomes per host
   1.400 +	 * rather than per URL */
   1.401 +	ban_webcache (req, "Redirected");
   1.402 +
   1.403 +	GT->DBGFN (GT, "Redirecting to new webcache %s/%s", new_host, new_path);
   1.404 +
   1.405 +	insert_webcache (new_host, new_path, time (NULL));
   1.406 +	file_cache_sync (web_caches);
   1.407 +
   1.408 +	return TRUE;
   1.409 +}
   1.410 +
   1.411 +/*****************************************************************************/
   1.412 +
   1.413 +static BOOL handle_recv (HttpRequest *req, char *data, size_t len)
   1.414 +{
   1.415 +	String *s;
   1.416 +
   1.417 +	/* EOF */
   1.418 +	if (!data)
   1.419 +	{
   1.420 +		char *str = NULL;
   1.421 +
   1.422 +		if ((s = req->data))
   1.423 +			str = s->str;
   1.424 +
   1.425 +		GT->DBGFN (GT, "read %s from server %s", str, req->host);
   1.426 +		end_request (req, str);
   1.427 +
   1.428 +		/* clear data link */
   1.429 +		req->data = NULL;
   1.430 +
   1.431 +		return TRUE;
   1.432 +	}
   1.433 +
   1.434 +	if (!len)
   1.435 +		return TRUE;
   1.436 +
   1.437 +	GT->DBGFN (GT, "server sent us: %s", data);
   1.438 +
   1.439 +	if (!(s = req->data) && !(s = req->data = string_new (NULL, 0, 0, TRUE)))
   1.440 +		return FALSE;
   1.441 +
   1.442 +	if (string_append (s, data) != len)
   1.443 +	{
   1.444 +		GT->DBGFN (GT, "string append failed");
   1.445 +		return FALSE;
   1.446 +	}
   1.447 +
   1.448 +	return TRUE;
   1.449 +}
   1.450 +
   1.451 +/*****************************************************************************/
   1.452 +
   1.453 +static BOOL handle_add_headers (HttpRequest *req, Dataset **headers)
   1.454 +{
   1.455 +	/* don't let intermediaries cache our request, I think */
   1.456 +	dataset_insertstr (headers, "Cache-Control", "no-cache");
   1.457 +
   1.458 +	return TRUE;
   1.459 +}
   1.460 +
   1.461 +/*****************************************************************************/
   1.462 +
   1.463 +static BOOL parse_host_and_port (char **r_host, in_port_t *r_port)
   1.464 +{
   1.465 +	char  *str;
   1.466 +	char  *host;
   1.467 +	long   port;
   1.468 +
   1.469 +	str = *r_host;
   1.470 +
   1.471 +	if (r_port)
   1.472 +		*r_port = 80;
   1.473 +
   1.474 +	/* skip leading 'http://' if found */
   1.475 +	if (strstr (str, "http://"))
   1.476 +		str += strlen ("http://");
   1.477 +
   1.478 +	host = string_sep (&str, ":");
   1.479 +
   1.480 +	if (!host)
   1.481 +		return FALSE;
   1.482 +
   1.483 +	*r_host = host;
   1.484 +
   1.485 +	if (str && !string_isempty (str))
   1.486 +	{
   1.487 +		port = gift_strtol (str);
   1.488 +
   1.489 +		/* make sure port is valid */
   1.490 +		if (port <= 0 || port >= 65536)
   1.491 +			return FALSE;
   1.492 +
   1.493 +		*r_port = port;
   1.494 +	}
   1.495 +
   1.496 +	return TRUE;
   1.497 +}
   1.498 +
/*
 * Resolve http_name ("host[:port]", optionally with a scheme prefix) and
 * open a non-blocking TCP connection to it.
 *
 * Returns the new connection, or NULL on parse/DNS/connect failure.
 * Names resolving to a local address get the cache banned (protects
 * against caches pointing us at internal hosts).
 */
static TCPC *open_http_connection (HttpRequest *req, const char *http_name)
{
	in_addr_t       ip;
	in_port_t       port;
	char           *str;
	char           *name;
	TCPC           *c;
	struct hostent *host;

	if (!http_name)
		return NULL;

	/* work on a private copy: parse_host_and_port() modifies the buffer */
	if (!(str = STRDUP (http_name)))
		return NULL;

	name = str;

	if (!parse_host_and_port (&name, &port))
	{
		GT->DBGFN (GT, "error parsing hostname \"%s\"", str);
		free (str);
		return NULL;
	}

	if (!(host = gt_dns_lookup (name)))
	{
		free (str);
		return NULL;
	}

	/* ip is in network-order already */
	memcpy (&ip, host->h_addr, MIN (host->h_length, sizeof (ip)));

	if (net_match_host (ip, "LOCAL"))
	{
		free (str);
		ban_webcache (req, "Resolved to local IP");
		return NULL;
	}

	/* FALSE: non-blocking connect; completion is handled by the caller's
	 * input callback */
	c = tcp_open (ip, port, FALSE);
	if (!c)
	{
		GT->DBGFN (GT, "couldn't open connection to %s [%s]: %s",
		           http_name, net_ip_str (ip), GIFT_NETERROR());
	}

	free (str);
	return c;
}
   1.549 +
   1.550 +/* return the name we have to lookup */
   1.551 +static char *get_http_name (char *name)
   1.552 +{
   1.553 +	char  *proxy;
   1.554 +	char  *host;
   1.555 +
   1.556 +	host  = name;
   1.557 +	proxy = HTTP_PROXY;
   1.558 +
   1.559 +	string_trim (proxy);
   1.560 +
   1.561 +	if (proxy && !string_isempty (proxy))
   1.562 +	{
   1.563 +		/* connect to the proxy instead */
   1.564 +		if (STRCMP (proxy, gt_proxy_server) != 0)
   1.565 +		{
   1.566 +			GT->DBGFN (GT, "using proxy server %s", proxy);
   1.567 +			free (gt_proxy_server);
   1.568 +			gt_proxy_server = STRDUP (proxy);
   1.569 +		}
   1.570 +
   1.571 +		host = proxy;
   1.572 +	}
   1.573 +
   1.574 +	return host;
   1.575 +}
   1.576 +
   1.577 +static void check_dns_error (const char *name, HttpRequest *req)
   1.578 +{
   1.579 +	int error;
   1.580 +
   1.581 +	error = gt_dns_get_errno ();
   1.582 +
   1.583 +	if (!error)
   1.584 +		return;
   1.585 +
   1.586 +	GT->DBGFN (GT, "lookup failed on \"%s\": %s", name, gt_dns_strerror(error));
   1.587 +
   1.588 +	/* ban the host, but only if not using a proxy server */
   1.589 +	if (error == HOST_NOT_FOUND && gt_proxy_server == NULL)
   1.590 +	{
   1.591 +		GT->DBGFN (GT, "webcache \"%s\" not in DNS. banning", name);
   1.592 +		ban_webcache (req, "Host not found in DNS");
   1.593 +		return;
   1.594 +	}
   1.595 +}
   1.596 +
/*
 * Build and launch an HTTP request ("hostfile=..." or "urlfile=...")
 * against the webcache at host_name/remote_path.
 *
 * Returns TRUE once the request is wired onto the event loop; on any
 * failure the request is closed and FALSE is returned.
 */
static BOOL make_request (char *host_name, char *remote_path, char *request)
{
	HttpRequest    *req;
	TCPC           *c;
	char           *resolve_name;
	char           *url;

	url = stringf_dup ("http://%s/%s", host_name, STRING_NOTNULL(remote_path));

	if (!(req = gt_http_request_new (url, request)))
	{
		free (url);
		return FALSE;
	}

	/* gt_http_request_new keeps its own copy; our url is no longer needed */
	free (url);

	/* may return the proxy's name instead of the cache's */
	resolve_name = get_http_name (host_name);

	/* clear stale resolver state so check_dns_error() sees this lookup */
	gt_dns_set_errno (0);

	if (!(c = open_http_connection (req, resolve_name)))
	{
		check_dns_error (resolve_name, req);
		gt_http_request_close (req, -1);
		return FALSE;
	}

	GT->DBGFN (GT, "opening connection to %s [%s]",
	           resolve_name, net_ip_str (c->host));

	req->recv_func       = handle_recv;
	req->add_header_func = handle_add_headers;
	req->close_req_func  = handle_close_request;
	req->redirect_func   = handle_redirect;

	gt_http_request_set_conn    (req, c);               /* setup references */
	gt_http_request_set_proxy   (req, gt_proxy_server); /* maybe use proxy */
	gt_http_request_set_timeout (req, 2 * MINUTES);     /* don't wait forever */
	gt_http_request_set_max_len (req, 65536);           /* don't read forever */

	input_add (c->fd, c, INPUT_WRITE,
	           (InputCallback)gt_http_request_handle, TIMEOUT_DEF);

	return TRUE;
}
   1.643 +
   1.644 +/*****************************************************************************/
   1.645 +
/* state threaded through foreach_rand_cache() while it samples one
 * webcache at random from the dataset */
struct find_rand_args
{
	int    n;      /* eligible caches seen so far; selection chance is 1/n */
	time_t now;    /* current time, for the retry-interval check */
	char  *url;    /* currently selected cache url (malloc'd copy) */
	char  *field;  /* its stored value, i.e. last-access time string */
};
   1.653 +
/* get a random cache from the webcaches dataset: a single foreach pass
 * performs reservoir sampling, so the k-th eligible cache replaces the
 * current pick with probability 1/k */
static void foreach_rand_cache (ds_data_t *key, ds_data_t *value,
                                struct find_rand_args *args)
{
	time_t  atime;
	float   range = args->n;
	char   *str;
	char   *url   = key->data;
	char   *hostname, *path;
	int     ret;

	if (!parse_web_cache_value (value->data, &atime))
		return;

	/* skip the cache entirely if we've retried too soon */
	if (args->now - atime < CACHE_RETRY_TIME)
		return;

	/*
	 * Make sure the cache has a parseable url
	 *
	 * TODO: This is ugly, it really should be parsed into a
	 * a data structure once instead.
	 */
	str = STRDUP (url);
	ret = gt_http_url_parse (str, &hostname, &path);
	free (str);

	if (!ret)
	{
		GT->warn (GT, "bad webcache url \"%s\" from %s/gwebcaches",
		          key->data, gift_conf_path ("Gnutella"));
		return;
	}

	/* decrease probability of selecting the next web cache */
	args->n++;

	/*
	 * Select this webcache with probability 1/n.
	 *
	 * Also select this cache if we haven't chosen one yet, which may be the
	 * case if the index of the cache is > 0 when there are banned caches.
	 */
	if (args->url == NULL ||
	    range * rand() / (RAND_MAX + 1.0) < 1.0)
	{
		char *keystr   = key->data;
		char *valuestr = value->data;

		/* check if this is a bad gwebcache */
		if (file_cache_lookup (bad_caches, url))
		{
#if 1
			GT->warn (GT, "skipping webcache %s, in bad gwebcaches", url);
#endif
			/* pretend we didn't select this to ensure equal distribution */
			args->n--;

			return;
		}

		/* free the old values (free(NULL) is fine on the first pick) */
		free (args->url);
		free (args->field);

		args->url   = STRDUP (keystr);
		args->field = STRDUP (valuestr);
	}
}
   1.724 +
/*
 * Pick one usable webcache at random and return malloc'd copies of its
 * host name and remote path through *r_host_name / *r_remote_path.
 *
 * Returns FALSE when no eligible cache exists or its stored URL does
 * not parse.
 */
static BOOL get_random_cache (time_t now, char **r_host_name,
                              char **r_remote_path)
{
	int                    ret;
	struct find_rand_args  args;

	args.n     = 1;         /* initial probability */
	args.now   = now;       /* current time */
	args.url   = NULL;
	args.field = NULL;

	dataset_foreach (web_caches->d, DS_FOREACH(foreach_rand_cache), &args);

	if (!args.url)
	{
		GT->DBGFN (GT, "couldn't find random cache");
		return FALSE;
	}

	/* parses in place: the result pointers point into args.url */
	ret = gt_http_url_parse (args.url, r_host_name, r_remote_path);

	if (!*r_host_name || !*r_remote_path)
	{
		free (args.url);
		free (args.field);
		return FALSE;
	}

	/* copy the pieces out before freeing the buffer they point into */
	*r_host_name   = STRDUP (*r_host_name);
	*r_remote_path = STRDUP (*r_remote_path);

	/* free the original buffer */
	free (args.url);
	free (args.field);

	return ret;
}
   1.762 +
/*
 * Hit up to max_requests randomly-chosen gwebcaches with a hostfile
 * request, refreshing each contacted cache's last-access time.  Only one
 * round may be in flight at a time (tracked by checking_caches).
 */
static void access_gwebcaches (void)
{
	int     len;
	char   *host_name;
	char   *remote_path;
	time_t  now;
	int     host_requests = 0;
#if 0
	int     url_requests  = 0;
#endif
	int     max_requests = 1;
	BOOL    ret;
	BOOL    need_sync;

	/*
	 * We may get called while a check of the gwebcaches is already
	 * in progress.
	 */
	if (checking_caches)
	{
		GT->DBGFN (GT, "Access already in progress");
		return;
	}

	now = time (NULL);

	len = dataset_length (web_caches->d);

	/* can't make more requests than there are caches */
	if (max_requests > len)
		max_requests = len;

	need_sync = FALSE;

	while (host_requests < max_requests)
	{
		if (!get_random_cache (now, &host_name, &remote_path))
		{
			GT->DBGFN (GT, "error looking up cache");
			break;
		}

#if 0
		/* make a url request sometimes to keep the cache file up to date, but
		 * mostly ask for hosts */
		if (10.0 * rand() / (RAND_MAX + 1.0) < 1.0)
		{
			ret = make_request (host_name, remote_path,
			                    "urlfile=1&client=GIFT&version=" GT_VERSION);
			url_requests++;
		}
		else
#endif
		{
			ret = make_request (host_name, remote_path,
			                    "hostfile=1&client=GIFT&version=" GT_VERSION);

			if (ret)
				checking_caches = TRUE;

			host_requests++;
		}

		if (ret)
		{
			GT->DBGFN (GT, "hitting web cache [total cache hits %u] "
			           "(cache: http://%s/%s)", cache_hits,
			           host_name, STRING_NOTNULL(remote_path));

			cache_hits++;
			need_sync = TRUE;

			/* reset the atime for the cache */
			insert_webcache (host_name, remote_path, now);
		}

		/* get_random_cache() returned malloc'd copies */
		free (host_name);
		free (remote_path);
	}

	/* only sync when we successfully accessed a cache */
	if (need_sync)
		file_cache_sync (web_caches);
}
   1.846 +
   1.847 +static BOOL webcache_update (void *udata)
   1.848 +{
   1.849 +	char       *webcache_file;
   1.850 +	int         web_exists;
   1.851 +	time_t      now;
   1.852 +	size_t      nodes_len;
   1.853 +	struct stat st;
   1.854 +
   1.855 +	if (GNUTELLA_LOCAL_MODE)
   1.856 +		return TRUE;
   1.857 +
   1.858 +	now = time (NULL);
   1.859 +	nodes_len = gt_conn_length (GT_NODE_NONE, GT_NODE_ANY);
   1.860 +
   1.861 +	/*
   1.862 +	 * If we've already accessed the caches successfully, we won't
   1.863 +	 * allow another access to go through, _unless_ the node list
   1.864 +	 * is small enough, in which case it could be we really do need
   1.865 +	 * to access the caches.
   1.866 +	 */
   1.867 +	if (now < next_atime && nodes_len >= 20)
   1.868 +		return FALSE;
   1.869 +
   1.870 +	webcache_file = STRDUP (gift_conf_path ("Gnutella/gwebcaches"));
   1.871 +	web_exists = file_stat (webcache_file, &st);
   1.872 +
   1.873 +	if (!web_exists)
   1.874 +	{
   1.875 +		GIFT_ERROR (("gwebcaches file doesn't exist"));
   1.876 +		return FALSE;
   1.877 +	}
   1.878 +
   1.879 +	/*
   1.880 +	 * next_atime, the absolute next time we allow ourselves to contact the
   1.881 +	 * caches, gets set when we sucessfully access the caches, and if we
   1.882 +	 * manage to get some hosts from a cache we access in an exponentially
   1.883 +	 * decreasing interval.
   1.884 +	 */
   1.885 +	access_gwebcaches ();
   1.886 +
   1.887 +	free (webcache_file);
   1.888 +	return TRUE;
   1.889 +}
   1.890 +
   1.891 +/*****************************************************************************/
   1.892 +
   1.893 +void gt_web_cache_update (void)
   1.894 +{
   1.895 +	webcache_update (NULL);
   1.896 +}
   1.897 +
   1.898 +BOOL gt_web_cache_init (void)
   1.899 +{
   1.900 +	/*
   1.901 +	 * Copy the gwebcaches file to from the data dir to
   1.902 +	 * ~/.giFT/Gnutella if it is newer or if ~/.giFT/Gnutella/gwebcaches
   1.903 +	 * doesn't exist.
   1.904 +	 */
   1.905 +	gt_config_load_file ("Gnutella/gwebcaches", TRUE, FALSE);
   1.906 +
   1.907 +	web_caches = file_cache_new (gift_conf_path ("Gnutella/gwebcaches"));
   1.908 +	bad_caches = file_cache_new (gift_conf_path ("Gnutella/bad_gwebcaches"));
   1.909 +
   1.910 +	if (!web_caches)
   1.911 +		return FALSE;
   1.912 +
   1.913 +	return TRUE;
   1.914 +}
   1.915 +
   1.916 +void gt_web_cache_cleanup (void)
   1.917 +{
   1.918 +	file_cache_free (web_caches);
   1.919 +	web_caches = NULL;
   1.920 +
   1.921 +	file_cache_free (bad_caches);
   1.922 +	bad_caches = NULL;
   1.923 +
   1.924 +	cache_hits = 0;
   1.925 +	next_atime = 0;
   1.926 +
   1.927 +	checking_caches = FALSE;
   1.928 +}